# Source: ca_auto_table/chain/append_page2_from_coingecko.py
# (208 lines, 5.7 KiB, Python — exported from repository web view, 2025-12-04 15:50:55 +08:00)
import csv
import re
import time
from typing import List
import requests
def fetch_html(url: str, timeout: int = 15) -> str:
    """
    Fetch the HTML content of the target page, using a browser User-Agent
    and up to 3 attempts.

    Parameters:
        url: target page URL
        timeout: per-request timeout in seconds
    Returns:
        the page HTML string
    Raises:
        the last network/HTTP error if all attempts fail
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        ),
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Cache-Control": "no-cache",
        "Pragma": "no-cache",
    }
    last_exc = None
    for attempt in range(3):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            if resp.status_code == 200:
                return resp.text
            last_exc = Exception(f"HTTP {resp.status_code}")
        except requests.RequestException as e:
            # Narrowed from bare Exception: only network/HTTP failures are
            # worth retrying; programming errors should propagate immediately.
            last_exc = e
        if attempt < 2:
            # Back off between attempts, but do not sleep after the final
            # failure (the original slept 1s before raising, pure waste).
            time.sleep(1.0)
    raise last_exc if last_exc else Exception("未知的网络错误")
def parse_chain_names(html: str) -> List[str]:
    """
    Parse chain names out of a CoinGecko chains-page HTML, tolerating
    multiple page structures.

    Parameters:
        html: page HTML string
    Returns:
        chain-name list in page order, deduplicated, at most 100 entries
    """
    names: List[str] = []
    # Strategy: match the anchor text of links pointing at chain detail
    # pages, i.e. /zh/chains/<slug> or /en/chains/<slug>.
    link_name_pattern = re.compile(
        r'<a[^>]+href="/(?:zh|en)/chains/[^"]+"[^>]*>(.*?)</a>',
        re.IGNORECASE | re.DOTALL,
    )
    for m in link_name_pattern.finditer(html):
        # Strip any nested tags inside the anchor and trim whitespace.
        text = re.sub(r"<[^>]+>", "", m.group(1)).strip()
        if text:
            names.append(text)
    # Only link matching is used, to avoid picking up other nav/copy text.
    # Deduplicate (order-preserving) and drop obvious non-name noise.
    # FIX: the original comment promised dedup but the code never did it;
    # duplicates are common because a row can hold several links (icon +
    # text) to the same chain page.
    seen = set()
    cleaned: List[str] = []
    for n in names:
        if n in seen:
            continue
        # Filter common non-name link texts.
        if n.lower() in {"chains", "tvl", "volume", "market share"}:
            continue
        if len(n) > 100:
            continue
        seen.add(n)
        cleaned.append(n)
    # Some pages contain extra link text; keep at most the first 100 entries.
    return cleaned[:100]
def read_current_max_index(csv_path: str) -> int:
    """
    Return the largest rank number (first column) currently in the CSV.

    Parameters:
        csv_path: path to the CSV file
    Returns:
        the maximum rank; 0 when the file is missing or holds only a header
    """
    try:
        with open(csv_path, "r", encoding="utf-8") as fh:
            all_rows = list(csv.reader(fh))
    except FileNotFoundError:
        # No file yet: numbering starts from scratch.
        return 0
    best = 0
    for record in all_rows[1:]:  # skip the header row
        if not record:
            continue
        try:
            best = max(best, int(record[0]))
        except Exception:
            # Non-numeric first column: ignore the row.
            pass
    return best
def append_to_csv(csv_path: str, names: List[str], start_index: int) -> None:
    """
    Append the parsed chain names to the CSV; the remaining columns are
    filled with unknown / not-applicable placeholders.

    Parameters:
        csv_path: path to the CSV file
        names: chain-name list to append
        start_index: rank number of the first appended row (e.g. 101)
    """
    with open(csv_path, "a", encoding="utf-8", newline="") as out:
        writer = csv.writer(out)
        # Rank numbers continue sequentially from start_index; the six
        # trailing columns cannot be scraped here, so they stay placeholders.
        for rank, chain_name in enumerate(names, start=start_index):
            writer.writerow(
                [rank, chain_name, "未知", "未知", "待补充", "", "不适用", "不适用"]
            )
def read_existing_names(csv_path: str) -> List[str]:
    """
    Return the chain names (second column) already present in the CSV.

    Parameters:
        csv_path: path to the CSV file
    Returns:
        name list; empty when the file does not exist
    """
    names: List[str] = []
    try:
        with open(csv_path, "r", encoding="utf-8") as fh:
            rows = csv.reader(fh)
            next(rows, None)  # skip the header line
            for record in rows:
                # Rows must have at least rank + name to contribute a name.
                if record and len(record) >= 2:
                    names.append(record[1])
    except FileNotFoundError:
        return []
    return names
def main() -> None:
    """
    Scrape chain names from the requested CoinGecko chains page and append
    them, in order, to the CSV.

    Notes:
        - Page 2 is fetched by default; a page number may be passed on the
          command line, e.g. `python script.py 3`.
        - The starting rank continues from the CSV's current maximum rank,
          so numbering never repeats.
    """
    import sys

    # Page number from argv[1] when present and numeric; otherwise keep 2.
    page = 2
    if len(sys.argv) >= 2:
        try:
            page = int(sys.argv[1])
        except Exception:
            pass

    csv_path = "chain/top_100_chains.csv"

    # Merge results parsed from both language variants of the page to
    # improve coverage; each fetch is best-effort.
    collected: List[str] = []
    for page_url in (
        f"https://www.coingecko.com/zh/chains?page={page}",
        f"https://www.coingecko.com/en/chains?page={page}",
    ):
        try:
            collected.extend(parse_chain_names(fetch_html(page_url)))
        except Exception:
            pass

    # Deduplicate while preserving order of first appearance.
    seen = set()
    unique_names: List[str] = []
    for candidate in collected:
        if candidate not in seen:
            seen.add(candidate)
            unique_names.append(candidate)
    if not unique_names:
        raise RuntimeError("未解析到链名称,请稍后重试或更换解析策略")

    # Only append names not already present in the CSV, to avoid duplicates.
    already_present = set(read_existing_names(csv_path))
    fresh = [n for n in unique_names if n not in already_present]
    if not fresh:
        return
    append_to_csv(csv_path, fresh, read_current_max_index(csv_path) + 1)


if __name__ == "__main__":
    main()