Files
haha/parse_html.py
2026-01-14 18:30:41 +08:00

282 lines
8.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import json
import sys
import io
from bs4 import BeautifulSoup
# Force UTF-8 on standard output so the Chinese status messages printed by this
# module do not fail on Windows consoles whose default code page cannot encode them.
if sys.platform == 'win32':
    if hasattr(sys.stdout, 'reconfigure'):
        # Python 3.7+: change the encoding in place and keep the same stream
        # object (and its buffering mode).
        sys.stdout.reconfigure(encoding='utf-8', errors='replace')
    elif hasattr(sys.stdout, 'buffer'):
        # Older interpreters: wrap the underlying byte stream instead.
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
    # Otherwise stdout is None or a replacement object without a byte buffer;
    # leave it untouched rather than crash at import time.
def extract_video_from_meta(html_content):
    """
    Extract video information from the Open Graph ``<meta>`` tags of a page.

    Reads ``og:video`` (URL), ``og:videotime`` (duration) and
    ``og:videoquality`` (quality). Each tag is matched by its ``name``
    attribute first and, failing that, by ``property``, which is the
    attribute the Open Graph protocol itself specifies.

    Args:
        html_content: HTML document as a string.

    Returns:
        dict | None: A dict with a ``url`` key and, when the tags are present,
        ``time`` and ``quality`` keys (raw ``content`` attribute strings).
        ``None`` when no video URL is found or when parsing fails.
    """
    # Result key -> Open Graph key looked up in the <meta> tags.
    meta_keys = (
        ('url', 'og:video'),
        ('time', 'og:videotime'),
        ('quality', 'og:videoquality'),
    )
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        video_info = {}
        for field, meta_key in meta_keys:
            content = _find_meta_content(soup, meta_key)
            if content:
                video_info[field] = content
        # Duration/quality alone are useless without a URL.
        return video_info if video_info.get('url') else None
    except Exception as e:
        # Best-effort helper: report the problem and let the caller continue.
        print(f"从meta标签提取视频信息时出错{e}")
        return None


def _find_meta_content(soup, meta_key):
    """Return the non-empty ``content`` of the first ``<meta>`` tagged *meta_key*, else None."""
    for attr in ('name', 'property'):
        tag = soup.find('meta', attrs={attr: meta_key})
        if tag is not None:
            content = tag.get('content')
            if content:
                return content
    return None
def parse_html_file(html_file='1.html'):
    """
    Parse a saved note page and extract title, description, images, videos and topics.

    The note itself is read from the ``window.__INITIAL_STATE__`` object
    embedded in a ``<script>`` tag. Video information found in the page's
    ``<meta>`` tags is added to the note's ``videos`` list, and is returned
    on its own when the note data is missing or cannot be extracted.

    Args:
        html_file: Path of the HTML file to parse (read as UTF-8).

    Returns:
        dict | None: The note data. ``{'videos': [video_info]}`` when only the
        meta-tag video is available. ``None`` when nothing could be extracted,
        the file is missing, or the embedded JSON is invalid.
    """
    try:
        with open(html_file, 'r', encoding='utf-8') as f:
            html_content = f.read()

        # Video information from the <meta> tags, also used as the fallback result.
        video_info = extract_video_from_meta(html_content)
        meta_only = {'videos': [video_info]} if video_info else None

        # Grab the object literal assigned to window.__INITIAL_STATE__.
        pattern = r'<script>window\.__INITIAL_STATE__\s*=\s*({.*?});?\s*</script>'
        match = re.search(pattern, html_content, re.DOTALL)
        if not match:
            print("未找到 window.__INITIAL_STATE__ 数据")
            return meta_only

        # JSON has no `undefined`; turn the bare JavaScript token into null
        # without touching the word when it appears inside string values.
        json_str = _replace_js_undefined(match.group(1))
        initial_state = json.loads(json_str)

        note_data = extract_note_data(initial_state)
        if not note_data:
            # Same fallback as the "no __INITIAL_STATE__" case above.
            return meta_only

        if video_info:
            videos = note_data.get('videos') or []
            # Skip the meta video when the same URL is already listed.
            if all(video.get('url') != video_info['url'] for video in videos):
                videos.append(video_info)
            note_data['videos'] = videos
        return note_data
    except FileNotFoundError:
        print(f"错误:找不到文件 {html_file}")
        return None
    except json.JSONDecodeError as e:
        print(f"JSON解析错误{e}")
        return None
    except Exception as e:
        print(f"解析错误:{e}")
        return None


def _replace_js_undefined(js_text):
    """Replace bare ``undefined`` tokens with ``null``, leaving double-quoted strings intact."""
    # First alternative consumes a whole string literal (with escapes) so that
    # the second alternative can only ever match outside of strings.
    token_pattern = r'"[^"\\]*(?:\\.[^"\\]*)*"|\bundefined\b'

    def _swap(token_match):
        token = token_match.group(0)
        return token if token.startswith('"') else 'null'

    return re.sub(token_pattern, _swap, js_text, flags=re.DOTALL)
def extract_note_data(initial_state):
    """
    Extract the first note's title, description, images, videos and topics.

    The note is read from
    ``initial_state['note']['noteDetailMap'][note_id]['note']`` where
    ``note_id`` is ``note.firstNoteId`` or, when that is missing, the first
    key of ``noteDetailMap``. Keys whose value is ``null`` are treated like
    missing keys, so a ``null`` image list or video no longer discards the
    whole note.

    Args:
        initial_state: Dict parsed from ``window.__INITIAL_STATE__``.

    Returns:
        dict | None: A dict with the keys ``title``, ``desc``, ``images``,
        ``videos`` and ``topics``; ``None`` when no note can be located or an
        unexpected error occurs.
    """
    try:
        # `or {}` (rather than a .get default) also covers explicit nulls.
        note_store = initial_state.get('note') or {}
        note_detail_map = note_store.get('noteDetailMap') or {}

        first_note_id = note_store.get('firstNoteId')
        if not first_note_id:
            if not note_detail_map:
                print("未找到笔记ID")
                return None
            # No explicit id: fall back to the first note in the map.
            first_note_id = next(iter(note_detail_map))

        note_detail = note_detail_map.get(first_note_id) or {}
        note_info = note_detail.get('note') or {}
        if not note_info:
            print("未找到笔记信息")
            return None

        video_data = _extract_video(note_info)
        return {
            'title': note_info.get('title'),
            'desc': note_info.get('desc'),
            'images': _extract_images(note_info),
            'videos': [video_data] if video_data else [],
            'topics': _extract_topics(note_info),
        }
    except Exception as e:
        print(f"提取笔记数据时出错:{e}")
        import traceback
        traceback.print_exc()
        return None


def _extract_images(note_info):
    """Return one ``{'url', 'urlPre', 'width', 'height'}`` dict per entry of ``imageList``."""
    return [
        {
            'url': img.get('urlDefault') or img.get('url'),
            'urlPre': img.get('urlPre'),
            'width': img.get('width'),
            'height': img.get('height'),
        }
        for img in (note_info.get('imageList') or [])
        if isinstance(img, dict)
    ]


def _extract_video(note_info):
    """Return ``{'url'[, 'cover'][, 'time']}`` for the note's video, or None without a URL."""
    video_info = note_info.get('video') or {}
    if not video_info:
        return None

    media = video_info.get('media') or {}
    stream = media.get('stream') or {}
    hls = stream.get('hls') or {}
    # Prefer the HLS stream; otherwise try the URL fields directly on `media`.
    url = (
        hls.get('masterUrl')
        or hls.get('url')
        or media.get('url')
        or media.get('videoUrl')
    )
    if not url:
        return None

    video_data = {'url': url}
    if video_info.get('cover'):
        video_data['cover'] = video_info.get('cover')
    if video_info.get('time'):
        video_data['time'] = video_info.get('time')
    return video_data


def _extract_topics(note_info):
    """Collect topics from the structured lists and from ``#topic#`` markers in the description."""
    topics = []

    # The list may live under any of these keys; the first non-empty one wins.
    topic_list = (
        note_info.get('topicList')
        or note_info.get('tagList')
        or note_info.get('hashtagList')
        or []
    )
    for topic in topic_list:
        if not isinstance(topic, dict):
            continue
        name = topic.get('name') or topic.get('title') or topic.get('tagName')
        if name:
            topics.append({
                'name': name,
                'id': topic.get('id') or topic.get('topicId') or topic.get('tagId'),
            })

    desc = note_info.get('desc')
    if desc and isinstance(desc, str):
        # Names already collected, so inline topics are not added twice.
        seen = {t['name'] for t in topics if isinstance(t['name'], str)}
        for name in re.findall(r'#([^#]+)#', desc):
            if name not in seen:
                seen.add(name)
                topics.append({'name': name})
    return topics
def print_note_data(note_data):
    """
    Print the note data to standard output in a readable layout.

    Shows a banner, the title, the description (only when non-empty) and,
    for images, videos and topics in turn, a heading, the item count and
    the raw list.

    Args:
        note_data: Note data dict; a falsy value prints a "nothing to show"
            message instead.
    """
    if not note_data:
        print("没有可显示的数据")
        return

    rule = "=" * 60
    print(rule)
    print("笔记信息")
    print(rule)

    print(f"标题: {note_data.get('title')}")
    print()

    print("描述:")
    description = note_data.get('desc', '')
    if description:
        print(description)

    # (heading, key in note_data, count line template) for each list section.
    sections = (
        ("图片列表:", 'images', "{count} 张图片"),
        ("视频列表:", 'videos', "{count} 个视频"),
        ("话题列表:", 'topics', "{count} 个话题"),
    )
    for heading, key, count_template in sections:
        print(heading)
        items = note_data.get(key, [])
        print(count_template.format(count=len(items)))
        print(items)
def main(argv=None):
    """
    Parse an HTML file, save the extracted note as JSON and print it.

    Args:
        argv: Optional argument list ``[html_file, output_file]``; defaults to
            ``sys.argv[1:]``. A missing ``html_file`` falls back to
            ``'1.html'`` and a missing ``output_file`` to ``'note_data.json'``,
            which is what the script used before it accepted arguments.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    html_file = args[0] if args else '1.html'
    output_file = args[1] if len(args) > 1 else 'note_data.json'

    print("正在解析HTML文件...")
    note_data = parse_html_file(html_file)
    if not note_data:
        print("解析失败请检查HTML文件")
        return

    # Save first, so the data survives even if printing it fails below.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(note_data, f, ensure_ascii=False, indent=2)
    print(f"数据已保存到 {output_file}")
    print()

    # Printing may still fail on consoles with a limited encoding.
    try:
        print_note_data(note_data)
    except Exception as e:
        print(f"打印数据时出错但数据已保存到JSON文件: {e}")
        print(f"请查看 {output_file} 文件获取完整数据")


if __name__ == '__main__':
    main()