"""Extract note data (title, description, images, videos, topics) from a saved HTML page."""
import io
import json
import re
import sys

from bs4 import BeautifulSoup
# Force stdout to UTF-8 so non-ASCII text prints correctly in the Windows
# console, which defaults to a legacy codepage and would otherwise raise
# UnicodeEncodeError.
if sys.platform == 'win32':
    if hasattr(sys.stdout, 'reconfigure'):
        # Python 3.7+: reconfigure the existing stream in place. This keeps
        # line buffering intact and does not leave a detached wrapper behind.
        sys.stdout.reconfigure(encoding='utf-8', errors='replace')
    else:
        # Fallback: re-wrap the underlying binary buffer. Guarded by the
        # hasattr check above so we don't crash when stdout has already been
        # replaced by an object without a .buffer attribute.
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
def extract_video_from_meta(html_content):
    """Extract video info from the page's <meta> tags.

    Looks for og:video, og:videotime and og:videoquality. The Open Graph
    convention writes these as ``<meta property="og:...">``, while some
    pages use ``name="og:..."`` instead (the original code only checked
    ``name``); both attributes are now accepted.

    Args:
        html_content: HTML content as a string.

    Returns:
        dict: Video info with key 'url' (and optionally 'time' and
        'quality') when a video URL is found, otherwise None.
    """
    def _meta_content(soup, key):
        # Try name="..." first (matches the pages this script targets),
        # then the standard Open Graph property="..." form.
        for attr in ('name', 'property'):
            tag = soup.find('meta', {attr: key})
            if tag and tag.get('content'):
                return tag.get('content')
        return None

    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        video_info = {}

        url = _meta_content(soup, 'og:video')
        if url:
            video_info['url'] = url

        duration = _meta_content(soup, 'og:videotime')
        if duration:
            video_info['time'] = duration

        quality = _meta_content(soup, 'og:videoquality')
        if quality:
            video_info['quality'] = quality

        # Time/quality alone are useless; only report when a URL was found.
        if video_info.get('url'):
            return video_info

        return None
    except Exception as e:
        print(f"从meta标签提取视频信息时出错:{e}")
        return None
def parse_html_file(html_file='1.html'):
    """Parse a saved HTML file and extract title, description, images,
    videos and topics.

    The page is expected to embed its data as a ``window.__INITIAL_STATE__``
    JSON blob inside a <script> tag; video info may additionally appear in
    og:video meta tags.

    Args:
        html_file: Path of the HTML file to parse.

    Returns:
        dict: Extracted note data; a ``{'videos': [...]}`` fallback when
        only meta-tag video info is available; or None on failure.
    """
    video_info = None
    try:
        # Read the whole page into memory.
        with open(html_file, 'r', encoding='utf-8') as f:
            html_content = f.read()

        # Meta tags are independent of the JSON blob; grab them first so we
        # can still return something if the JSON is missing or corrupt.
        video_info = extract_video_from_meta(html_content)

        # Pull the JSON assigned to window.__INITIAL_STATE__.
        pattern = r'<script>window\.__INITIAL_STATE__\s*=\s*({.*?});?\s*</script>'
        match = re.search(pattern, html_content, re.DOTALL)

        if not match:
            print("未找到 window.__INITIAL_STATE__ 数据")
            # Fall back to the meta-tag video info alone, if any.
            if video_info:
                return {'videos': [video_info]}
            return None

        json_str = match.group(1)

        # JavaScript `undefined` is not valid JSON; map it to null.
        # NOTE(review): this also rewrites the bare word inside string
        # values — accepted trade-off for this page format.
        json_str = re.sub(r'\bundefined\b', 'null', json_str)

        initial_state = json.loads(json_str)

        note_data = extract_note_data(initial_state)

        if video_info:
            if note_data:
                # Merge the meta-tag video into the note's video list.
                if not note_data.get('videos'):
                    note_data['videos'] = []
                note_data['videos'].append(video_info)
            else:
                # Fix: previously the meta-tag video was dropped when the
                # state JSON held no usable note data.
                return {'videos': [video_info]}

        return note_data

    except FileNotFoundError:
        print(f"错误:找不到文件 {html_file}")
        return None
    except json.JSONDecodeError as e:
        print(f"JSON解析错误:{e}")
        # Fix: the meta-tag video info is still usable even when the
        # embedded JSON fails to parse; previously it was discarded.
        return {'videos': [video_info]} if video_info else None
    except Exception as e:
        print(f"解析错误:{e}")
        return None
def extract_note_data(initial_state):
    """Extract just the fields we need from the parsed initial state.

    Only title, description, image list, video list and topics are kept.

    Args:
        initial_state: Dict parsed from ``window.__INITIAL_STATE__``.

    Returns:
        dict: The extracted note fields, or None when no note is found or
        an error occurs.
    """
    try:
        note_store = initial_state.get('note', {})
        detail_map = note_store.get('noteDetailMap', {})

        # Prefer the explicit firstNoteId; otherwise take any note present.
        note_id = note_store.get('firstNoteId')
        if not note_id:
            if not detail_map:
                print("未找到笔记ID")
                return None
            note_id = next(iter(detail_map))

        note_info = detail_map.get(note_id, {}).get('note', {})
        if not note_info:
            print("未找到笔记信息")
            return None

        result = {
            'title': note_info.get('title'),
            'desc': note_info.get('desc'),
            'images': [
                {
                    'url': img.get('urlDefault') or img.get('url'),
                    'urlPre': img.get('urlPre'),
                    'width': img.get('width'),
                    'height': img.get('height'),
                }
                for img in note_info.get('imageList', [])
            ],
            'videos': [],
            'topics': [],
        }

        # The video URL lives under media.stream.hls when present, with a
        # couple of flat fallbacks on the media object itself.
        video = note_info.get('video', {})
        if video:
            video_entry = {}
            media = video.get('media', {})
            if media:
                stream = media.get('stream', {})
                hls = stream.get('hls', {}) if stream else {}
                if hls:
                    video_entry['url'] = hls.get('masterUrl') or hls.get('url')
                if not video_entry.get('url'):
                    video_entry['url'] = media.get('url') or media.get('videoUrl')
            if video.get('cover'):
                video_entry['cover'] = video.get('cover')
            if video.get('time'):
                video_entry['time'] = video.get('time')
            # Only record the video when a URL was actually found.
            if video_entry.get('url'):
                result['videos'].append(video_entry)

        # Topics may live under several different keys depending on the page.
        raw_topics = (note_info.get('topicList', [])
                      or note_info.get('tagList', [])
                      or note_info.get('hashtagList', []))
        for raw in raw_topics or []:
            entry = {
                'name': raw.get('name') or raw.get('title') or raw.get('tagName'),
                'id': raw.get('id') or raw.get('topicId') or raw.get('tagId'),
            }
            if entry.get('name'):
                result['topics'].append(entry)

        # The description may embed extra topics in #topic# form; add the
        # ones not already collected above.
        desc_text = note_info.get('desc', '')
        if desc_text:
            seen = {t.get('name') for t in result['topics']}
            for tag in re.findall(r'#([^#]+)#', desc_text):
                if tag not in seen:
                    result['topics'].append({'name': tag})
                    seen.add(tag)

        return result

    except Exception as e:
        print(f"提取笔记数据时出错:{e}")
        import traceback
        traceback.print_exc()
        return None
def print_note_data(note_data):
    """Pretty-print the extracted note fields to stdout.

    Args:
        note_data: Dict produced by the extraction step; a falsy value
            prints a placeholder message instead.
    """
    if not note_data:
        print("没有可显示的数据")
        return

    divider = "=" * 60
    print(divider)
    print("笔记信息")
    print(divider)
    print(f"标题: {note_data.get('title')}")
    print()

    print("描述:")
    description = note_data.get('desc', '')
    if description:
        print(description)

    # Each collection gets a header, a count line, and a raw dump.
    for header, unit, key in (
        ("图片列表:", "张图片", 'images'),
        ("视频列表:", "个视频", 'videos'),
        ("话题列表:", "个话题", 'topics'),
    ):
        items = note_data.get(key, [])
        print(header)
        print(f"  共 {len(items)} {unit}")
        print(items)
if __name__ == '__main__':
    # Parse the saved page, persist the result as JSON, then display it.
    print("正在解析HTML文件...")
    note_data = parse_html_file('1.html')

    if not note_data:
        print("解析失败,请检查HTML文件")
    else:
        # Persist first: the JSON file survives even if printing fails.
        with open('note_data.json', 'w', encoding='utf-8') as f:
            json.dump(note_data, f, ensure_ascii=False, indent=2)
        print("数据已保存到 note_data.json")
        print()

        # Console printing may still hit encoding issues; the data is
        # already on disk at this point.
        try:
            print_note_data(note_data)
        except Exception as e:
            print(f"打印数据时出错(但数据已保存到JSON文件): {e}")
            print("请查看 note_data.json 文件获取完整数据")