import requests
from bs4 import BeautifulSoup
import random
import html2text
import os
import re
import pdfkit
import time
import logging
import json
from urllib.parse import urlparse
# User-agent pool: a random user agent is picked for each run so that frequent requests are less likely to get blocked
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
    # ... other user agents ...
]
class CSDNSpider():
    def __init__(self):
        self.session = requests.Session()
        self.headers = {
            'User-Agent': random.choice(USER_AGENT_LIST),
            'Referer': 'https://blog.csdn.net/',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Cookie': 'uuid_tt_dd=10_28867322640-1634638612434-763342; dc_session_id=10_1634638612434.351143; c_first_ref=www.baidu.com; c_first_page=https%3A//blog.csdn.net/; dc_sid=96044c0a6a9786eff8e99a4b7542f67b'
        }
        self.md_dir = r'D:\EdgeDownload\VSCode\Documents\cursor-documents\CSDN导出\MD'
        self.pdf_dir = r'D:\EdgeDownload\VSCode\Documents\cursor-documents\CSDN导出\PDF'
        self.html_dir = r'D:\EdgeDownload\VSCode\Documents\cursor-documents\CSDN导出\HTML'
        # Make sure every output directory exists
        for dir_path in [self.md_dir, self.pdf_dir, self.html_dir]:
            os.makedirs(dir_path, exist_ok=True)
        # Fetch and parse robots.txt once up front
        self.robots_rules = self.read_robots_txt()
    def read_robots_txt(self):
        robots_url = "https://blog.csdn.net/robots.txt"
        response = self.session.get(robots_url, headers=self.headers)
        if response.status_code == 200:
            print("Read robots.txt successfully")
            return response.text
        else:
            print("Could not read robots.txt")
            return ""
    def is_allowed_by_robots(self, url):
        # Naive robots.txt parsing: collect every Disallow path and check the URL's path against them
        disallowed_paths = []
        for line in self.robots_rules.splitlines():
            if line.startswith("Disallow:"):
                path = line.split(":", 1)[1].strip()
                if path:
                    disallowed_paths.append(path)
        url_path = urlparse(url).path or "/"
        for path in disallowed_paths:
            if re.match(path.replace("*", ".*"), url_path):
                print(f"URL '{url}' is disallowed by robots.txt")
                return False
        return True
    def fetch_article(self, url):
        # Strip a leading @ if one was pasted in with the URL
        if url.startswith('@'):
            url = url[1:]
        if not self.is_allowed_by_robots(url):
            print("This URL cannot be fetched under the site's robots.txt rules")
            return None
        match = re.match(r'https://blog\.csdn\.net/([^/]+)/article/details/(\d+)', url)
        if not match:
            print("Not a valid CSDN article URL")
            return None
        author, article_id = match.groups()
        standard_url = f"https://blog.csdn.net/{author}/article/details/{article_id}"
        print(f"Exporting article: {standard_url}")
        # Small random delay so requests are not fired back-to-back
        time.sleep(random.uniform(1, 3))
        response = self.session.get(url=standard_url, headers=self.headers)
        response.encoding = "utf-8"
        if response.status_code == 200:
            print("Fetched the page successfully")
            return self.parse_article(response.text)
        else:
            print(f"Failed to fetch the page, status code: {response.status_code}")
            return None
    def parse_article(self, html):
        soup = BeautifulSoup(html, 'lxml')
        title = soup.find('h1', class_='title-article')
        if title:
            title = title.text.strip()
            print(f"Found article title: {title}")
        else:
            print("Could not find the article title")
            return None
        content = soup.find('div', id="content_views")
        if content:
            print("Found the article content")
            h = html2text.HTML2Text()
            h.ignore_links = False
            markdown_content = h.handle(str(content))
            return title, markdown_content, str(content)
        else:
            print("Could not find the article content")
            return None
    def save_markdown(self, title, content):
        clean_title = re.sub(r'[\\/*?:"<>|]', "", title)
        filename = os.path.join(self.md_dir, f"{clean_title}.md")
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(f"# {title}\n\n{content}")
        return filename
    def save_html(self, title, content):
        clean_title = re.sub(r'[\\/*?:"<>|]', "", title)
        filename = os.path.join(self.html_dir, f"{clean_title}.html")
        # Minimal page shell (assumed, since the original html_str template was not included in the post);
        # swap in a fuller template with CSS if you want nicer output
        html_str = '<!DOCTYPE html><html><head><meta charset="utf-8"></head><body>{article}</body></html>'
        full_html = html_str.format(article=content)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(full_html)
        return filename
    def save_pdf(self, title, html_filename):
        clean_title = re.sub(r'[\\/*?:"<>|]', "", title)
        filename = os.path.join(self.pdf_dir, f"{clean_title}.pdf")
        config = pdfkit.configuration(wkhtmltopdf=r'D:\EdgeDownload\VSCode\wkhtmltopdf\bin\wkhtmltopdf.exe')
        pdfkit.from_file(html_filename, filename, configuration=config)
        return filename
def main(url):
    spider = CSDNSpider()
    result = spider.fetch_article(url)
    if result:
        title, markdown_content, html_content = result
        md_filename = spider.save_markdown(title, markdown_content)
        html_filename = spider.save_html(title, html_content)
        pdf_filename = spider.save_pdf(title, html_filename)
        print(f"Article '{title}' was exported and saved.")
        print(f"Markdown file: {md_filename}")
        print(f"HTML file: {html_filename}")
        print(f"PDF file: {pdf_filename}")
    else:
        print("Could not fetch the article content. Check that the URL is correct and that the article is accessible.")

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    article_url = input("Enter the CSDN article URL: ")
    main(article_url)
Things you need to change for your own setup (a consolidated sketch follows this list):
- Output directories
self.md_dir = r'D:\EdgeDownload\VSCode\Documents\cursor-documents\CSDN导出\MD'
self.pdf_dir = r'D:\EdgeDownload\VSCode\Documents\cursor-documents\CSDN导出\PDF'
self.html_dir = r'D:\EdgeDownload\VSCode\Documents\cursor-documents\CSDN导出\HTML'
- HTML-to-PDF conversion (point this at your own wkhtmltopdf executable)
config = pdfkit.configuration(wkhtmltopdf=r'D:\EdgeDownload\VSCode\wkhtmltopdf\bin\wkhtmltopdf.exe')
- User agents (optional)
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
    # ... other user agents ...
]
A ready-made user-agent library can also be used to fill this list.
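If you would rather change all of these in one place, here is a minimal sketch (my own addition, not part of the original script) that gathers the three settings at the top of the file. The environment-variable names CSDN_EXPORT_DIR and WKHTMLTOPDF_PATH are arbitrary choices, the fake-useragent package is optional, and USER_AGENT_LIST is assumed to be the list defined in the script above.

import os
import random
import shutil

# Base output folder; override with the CSDN_EXPORT_DIR environment variable (name is arbitrary)
BASE_DIR = os.environ.get(
    "CSDN_EXPORT_DIR",
    r"D:\EdgeDownload\VSCode\Documents\cursor-documents\CSDN导出",
)
MD_DIR = os.path.join(BASE_DIR, "MD")
PDF_DIR = os.path.join(BASE_DIR, "PDF")
HTML_DIR = os.path.join(BASE_DIR, "HTML")

# wkhtmltopdf executable: explicit env var first, then whatever is on PATH, then the hard-coded default
WKHTMLTOPDF_PATH = (
    os.environ.get("WKHTMLTOPDF_PATH")
    or shutil.which("wkhtmltopdf")
    or r"D:\EdgeDownload\VSCode\wkhtmltopdf\bin\wkhtmltopdf.exe"
)

def pick_user_agent():
    # Use the fake-useragent package if it happens to be installed,
    # otherwise fall back to the hand-maintained USER_AGENT_LIST above.
    try:
        from fake_useragent import UserAgent
        return UserAgent().random
    except ImportError:
        return random.choice(USER_AGENT_LIST)

With something like this in place, __init__ would point self.md_dir / self.pdf_dir / self.html_dir at MD_DIR / PDF_DIR / HTML_DIR, save_pdf would pass WKHTMLTOPDF_PATH to pdfkit.configuration(), and the 'User-Agent' header would come from pick_user_agent().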
Notes:
- It only grabs articles that are already accessible and saves them in other formats. Grab paywalled articles? I wish I could too.

- If it errors out as soon as you run it, the cause is almost always a missing library/dependency; paste the error into an AI and you will get the install command (a typical one is shown below). If that still does not work, feel free to come yell at me.
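For reference, the third-party packages the script imports can usually be installed in one go (lxml is included because BeautifulSoup is created with the 'lxml' parser); wkhtmltopdf itself is a standalone program that has to be installed separately and is not available through pip:

pip install requests beautifulsoup4 html2text pdfkit lxml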

If you have seen this post somewhere else, sorry, that was also posted by me.
For learning and research only; please do not use it for anything illegal.