一键提取某源码网/网盘链接！Python 爬虫

飞扬 (UID: 67) 8月前 [复制链接]

帖子链接已复制到剪贴板

帖子已经有人评论啦，不支持删除！

863 4

您好，本帖含有隐藏内容，必须登录才能查看：登录或注册

支持提取蓝奏云 / 百度 / 夸克 / 123 云盘，标题+链接对应导出

保存格式：标题 | 网盘链接

Python 版本要求：3.7 及以上

依赖库：requests、beautifulsoup4

MAX_PAGE = 100：需要抓取的列表页页数，可按需修改

抓取完毕后保存在桌面：title_pan_links.txt

一键提取某源码网/网盘链接！Python 爬虫

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
抓取 www.yydsym.com 
保存格式：标题 | 网盘链接
输出文件：桌面 title_pan_links.txt
"""
import os
import time
import random
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Number of listing pages to crawl (pages 1..MAX_PAGE).
MAX_PAGE = 100

def SLEEP():
    """Return a random delay in seconds (1.0-2.0) to wait between requests."""
    return random.uniform(1.0, 2.0)

# Per-request timeout in seconds.
TIMEOUT = 10
# Listing page URL template; ``{}`` is replaced by the page number.
LIST_URL = "https://www.yydsym.com/page/{}"
# Output file on the current user's Desktop.
SAVE_FILE = os.path.join(os.path.expanduser("~"), "Desktop", "title_pan_links.txt")

# Browser-like User-Agent sent with every request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}

# Substrings that mark an href as a cloud-drive share link.
PAN_KEYS = ("lanzou", "lanzn", "pan.baidu", "quark.cn", "123pan.com",
            "alipan.com", "xunlei.com", "cloud.189.cn", "tc.qq.com")

def get_list(page):
    """Fetch one listing page and return the articles found on it.

    Args:
        page: Page number substituted into ``LIST_URL``.

    Returns:
        A list of ``(title, link)`` tuples, one per ``article.post-grid``
        element that contains a title anchor with a non-empty ``href``.
        Links are made absolute against the listing page URL.

    Raises:
        requests.HTTPError: If the listing page responds with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    url = LIST_URL.format(page)
    resp = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    items = []
    for art in soup.select('article.post-grid'):
        a = art.select_one('h2.entry-title a')
        # Skip cards without a usable title link: a missing href would raise
        # KeyError and discard the whole page, and an empty href would
        # resolve to the listing page itself.
        href = a.get("href") if a else None
        if not href:
            continue
        items.append((a.get_text(strip=True), urljoin(url, href)))
    return items

def extract_pan(html):
    """Collect cloud-drive share links from an article page.

    Two sources are scanned:

    1. Anchors whose ``href`` contains ``goto?down=``. Each one is requested
       without following redirects; if the response is a redirect with a
       ``Location`` header, that target is recorded, otherwise the jump URL
       itself is recorded.
    2. Every ``<a href>`` whose URL contains one of ``PAN_KEYS``.

    Args:
        html: HTML source of the article page.

    Returns:
        A set of URL strings (possibly empty).
    """
    soup = BeautifulSoup(html, "html.parser")
    pans = set()
    redirect_codes = (301, 302, 303, 307, 308)

    # Sidebar download buttons that go through the site's jump page.
    for a in soup.select('a[href*="goto?down="]'):
        short = urljoin("https://www.yydsym.com", a["href"])
        try:
            r = requests.get(short, headers=HEADERS, allow_redirects=False,
                             timeout=TIMEOUT)
        except requests.RequestException:
            # Best effort: a jump link that cannot be fetched is skipped.
            continue
        location = r.headers.get("Location", "").strip()
        if r.status_code in redirect_codes and location:
            # Location may be relative; resolve it against the jump URL.
            pans.add(urljoin(short, location))
        else:
            # No usable redirect target: keep the jump URL rather than
            # recording an empty link.
            pans.add(short)

    # Every <a> on the page whose href looks like a cloud-drive link.
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        if any(k in href for k in PAN_KEYS):
            pans.add(href)
    return pans

def main():
    """Crawl the listing pages and append new ``title | link`` lines to SAVE_FILE.

    Links already present in SAVE_FILE are loaded first so that re-running
    the script only appends links that were not saved before. Failures on a
    listing page or on a single article are printed and skipped; the crawl
    continues with the next item.
    """
    exist = set()
    if os.path.isfile(SAVE_FILE):
        with open(SAVE_FILE, "r", encoding="utf-8") as f:
            for line in f:
                if "|" in line:
                    # The link is the last field; split from the right so a
                    # "|" inside the title does not corrupt the dedupe key.
                    exist.add(line.rsplit("|", 1)[1].strip())
    else:
        # Make sure the target directory exists; otherwise every write
        # below would fail and only be reported as a skipped article.
        save_dir = os.path.dirname(SAVE_FILE)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)

    print("🔍 开始抓取列表页...")
    for p in range(1, MAX_PAGE + 1):
        try:
            items = get_list(p)
            print(f"  第 {p:3d} 页 | 本页 {len(items):2d} 篇")
        except Exception as e:
            print(f"  第 {p:3d} 页 | 错误：{e}")
            continue
        time.sleep(SLEEP())

        print("     扫描详情页...")
        for title, url in items:
            try:
                resp = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
                # Do not parse error pages as if they were articles.
                resp.raise_for_status()
                pans = extract_pan(resp.text)
                new_links = [u for u in pans if u not in exist]
                if new_links:
                    # One open per article instead of one per link.
                    with open(SAVE_FILE, "a", encoding="utf-8") as f:
                        f.writelines(f"{title} | {u}\n" for u in new_links)
                    exist.update(new_links)
                    print(f"       +{len(new_links)} 条 | {title[:30]}...")
            except Exception as e:
                print(f"       跳过 | {e}")
            time.sleep(SLEEP())

    print(f"\n✅ 全部完成！文件已保存 → {SAVE_FILE}")

if __name__ == "__main__":
    main()

这家伙太懒了，什么也没留下。