208 lines
5.7 KiB
Python
208 lines
5.7 KiB
Python
import csv
|
||
import re
|
||
import time
|
||
from typing import List
|
||
|
||
import requests
|
||
|
||
|
||
def fetch_html(url: str, timeout: int = 15) -> str:
    """Fetch the HTML of a target page, with a browser-like UA and retries.

    Args:
        url: Target page URL.
        timeout: Per-request timeout in seconds.

    Returns:
        The page HTML as a string.

    Raises:
        Exception: The last error encountered once all retry attempts fail.
    """
    headers = {
        # Browser-like UA to avoid trivial bot blocking.
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        ),
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Cache-Control": "no-cache",
        "Pragma": "no-cache",
    }
    attempts = 3
    last_exc = None
    for attempt in range(attempts):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            if resp.status_code == 200:
                return resp.text
            last_exc = Exception(f"HTTP {resp.status_code}")
        except Exception as e:  # network error / timeout — retry
            last_exc = e
        # Fix: only pause BETWEEN attempts. The original also slept 1 s
        # after the final failure, delaying the raise for no benefit.
        if attempt < attempts - 1:
            time.sleep(1.0)
    raise last_exc if last_exc else Exception("未知的网络错误")
|
||
|
||
|
||
def parse_chain_names(html: str) -> List[str]:
    """Parse chain names from a CoinGecko chains-page HTML document.

    Matches anchor tags pointing at /zh/chains/<slug> or /en/chains/<slug>,
    strips nested markup from the link text, unescapes HTML entities
    (fix: the original left e.g. "&amp;" literally in the names), collapses
    internal whitespace from multi-line link text, and filters out common
    non-name link texts.

    Args:
        html: Page HTML as a string.

    Returns:
        Chain names in page order, capped at 100 entries.
    """
    # Local import: the `html` parameter shadows the stdlib module name.
    from html import unescape

    # Anchor links to chain detail pages: /zh/chains/<slug> or /en/chains/<slug>.
    link_name_pattern = re.compile(
        r'<a[^>]+href="/(?:zh|en)/chains/[^"]+"[^>]*>(.*?)</a>',
        re.IGNORECASE | re.DOTALL,
    )

    names: List[str] = []
    for m in link_name_pattern.finditer(html):
        # Drop nested tags, decode entities, normalize whitespace.
        text = unescape(re.sub(r"<[^>]+>", "", m.group(1)))
        text = " ".join(text.split())
        if text:
            names.append(text)

    # Filter common non-name link texts and implausibly long strings.
    cleaned: List[str] = []
    for n in names:
        if n.lower() in {"chains", "tvl", "volume", "market share"}:
            continue
        if len(n) > 100:
            continue
        cleaned.append(n)

    # Some pages include extra link text; keep at most the first 100 entries.
    return cleaned[:100]
|
||
|
||
|
||
def read_current_max_index(csv_path: str) -> int:
    """Return the largest rank value (first column) stored in the CSV.

    Args:
        csv_path: Path to the CSV file.

    Returns:
        The maximum rank found, or 0 when the file is missing, empty,
        or contains only a header row.
    """
    try:
        with open(csv_path, "r", encoding="utf-8") as fh:
            reader = csv.reader(fh)
            next(reader, None)  # skip the header row
            best = 0
            for record in reader:
                if not record:
                    continue
                try:
                    value = int(record[0])
                except Exception:
                    continue  # non-numeric rank cell — ignore
                if value > best:
                    best = value
            return best
    except FileNotFoundError:
        return 0
|
||
|
||
|
||
def append_to_csv(csv_path: str, names: List[str], start_index: int) -> None:
    """Append parsed chain names to the CSV, padding unknown fields.

    Args:
        csv_path: Path to the CSV file.
        names: Chain names to append.
        start_index: Rank assigned to the first appended row (e.g. 101).
    """
    # Remaining columns are filled with "unknown / to be filled" placeholders.
    placeholder_tail = ["未知", "未知", "待补充", "否", "不适用", "不适用"]
    with open(csv_path, "a", encoding="utf-8", newline="") as fh:
        writer = csv.writer(fh)
        for rank, chain_name in enumerate(names, start=start_index):
            writer.writerow([rank, chain_name, *placeholder_tail])
|
||
|
||
|
||
def read_existing_names(csv_path: str) -> List[str]:
    """Return the chain names (second column) already present in the CSV.

    Args:
        csv_path: Path to the CSV file.

    Returns:
        The list of names, or an empty list when the file does not exist.
    """
    try:
        with open(csv_path, "r", encoding="utf-8") as fh:
            reader = csv.reader(fh)
            next(reader, None)  # skip the header row
            found: List[str] = []
            for record in reader:
                if record and len(record) >= 2:
                    found.append(record[1])
            return found
    except FileNotFoundError:
        return []
|
||
|
||
|
||
def main() -> None:
    """Scrape one page of chain listings and append new names to the CSV.

    Notes:
        - Scrapes page 2 by default; a page number can be passed on the
          command line, e.g. `python script.py 3`.
        - The starting rank continues from the CSV's current maximum rank,
          so numbering never repeats.
    """
    import sys

    page = 2
    if len(sys.argv) >= 2:
        try:
            page = int(sys.argv[1])
        except Exception:
            pass  # keep the default page on a bad argument

    csv_path = "chain/top_100_chains.csv"
    # Merge results from both language variants of the page for coverage;
    # a failure on either variant is tolerated.
    page_urls = [
        f"https://www.coingecko.com/zh/chains?page={page}",
        f"https://www.coingecko.com/en/chains?page={page}",
    ]

    names: List[str] = []
    for page_url in page_urls:
        try:
            names.extend(parse_chain_names(fetch_html(page_url)))
        except Exception:
            pass

    # De-duplicate while preserving first-seen order.
    unique_names = list(dict.fromkeys(names))

    if not unique_names:
        raise RuntimeError("未解析到链名称,请稍后重试或更换解析策略")

    # Only append names the CSV does not already contain.
    already_present = set(read_existing_names(csv_path))
    fresh = [n for n in unique_names if n not in already_present]
    if not fresh:
        return

    append_to_csv(csv_path, fresh, read_current_max_index(csv_path) + 1)
|
||
|
||
|
||
# Script entry point: run the scraper only when executed directly.
if __name__ == "__main__":
    main()