282 lines
8.8 KiB
Python
282 lines
8.8 KiB
Python
|
|
import re
|
|||
|
|
import json
|
|||
|
|
import sys
|
|||
|
|
import io
|
|||
|
|
from bs4 import BeautifulSoup
|
|||
|
|
|
|||
|
|
# Force UTF-8 on standard output so the Windows console code page cannot
# raise UnicodeEncodeError when the script prints Chinese text.
if sys.platform == 'win32':
    if hasattr(sys.stdout, 'reconfigure'):
        # Re-encode the existing stream in place; keeps its buffering settings.
        sys.stdout.reconfigure(encoding='utf-8', errors='replace')
    elif hasattr(sys.stdout, 'buffer'):
        # Fallback for stream objects without reconfigure(): wrap the raw buffer.
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
    # Otherwise stdout is None or a custom object without a byte buffer:
    # leave it untouched instead of failing at import time.
|
|||
|
|
|
|||
|
|
|
|||
|
|
def extract_video_from_meta(html_content):
    """
    Extract video information from the Open Graph ``<meta>`` tags of a page.

    The tags ``og:video``, ``og:videotime`` and ``og:videoquality`` are looked
    up. Each one is matched by its ``name`` attribute first and, if that yields
    no content, by its ``property`` attribute.

    Args:
        html_content: HTML document as a string.

    Returns:
        dict: Video information with a ``url`` key and, when the matching tags
        carry content, ``time`` and ``quality`` keys. ``None`` if no video URL
        was found or if an exception was raised (the error is printed).
    """
    # (result key, meta tag identifier) pairs, in the order they are stored.
    meta_fields = (
        ('url', 'og:video'),
        ('time', 'og:videotime'),
        ('quality', 'og:videoquality'),
    )
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        video_info = {}

        for key, meta_id in meta_fields:
            # 'name' keeps the original lookup; 'property' is the fallback.
            for attr in ('name', 'property'):
                tag = soup.find('meta', {attr: meta_id})
                content = tag.get('content') if tag else None
                if content:
                    video_info[key] = content
                    break

        # Only a result that actually has a video URL is useful to callers.
        if video_info.get('url'):
            return video_info

        return None
    except Exception as e:
        print(f"从meta标签提取视频信息时出错:{e}")
        return None
|
|||
|
|
|
|||
|
|
|
|||
|
|
def parse_html_file(html_file='1.html'):
    """
    Parse an HTML file and extract title, description, images, videos and topics.

    The note payload is read from the inline ``window.__INITIAL_STATE__``
    script and passed to ``extract_note_data``. Video information returned by
    ``extract_video_from_meta`` is appended to the note's ``videos`` list.

    Args:
        html_file: Path of the HTML file to read (decoded as UTF-8).

    Returns:
        dict: The extracted note data. If the state script is missing but the
        meta tags describe a video, ``{'videos': [video_info]}`` is returned.
        ``None`` when the file does not exist, the JSON is invalid, nothing
        could be extracted, or any other error occurs (a message is printed).
    """
    try:
        # Read the HTML file.
        with open(html_file, 'r', encoding='utf-8') as f:
            html_content = f.read()

        # Video information from the meta tags.
        video_info = extract_video_from_meta(html_content)

        # Locate the object literal assigned to window.__INITIAL_STATE__.
        pattern = r'<script>window\.__INITIAL_STATE__\s*=\s*({.*?});?\s*</script>'
        match = re.search(pattern, html_content, re.DOTALL)

        if not match:
            print("未找到 window.__INITIAL_STATE__ 数据")
            # Fall back to the meta-tag video alone when there is one.
            if video_info:
                return {'videos': [video_info]}
            return None

        # JSON has no `undefined`, so bare `undefined` tokens become `null`.
        # The first alternative consumes whole string literals and returns them
        # unchanged, so the word "undefined" inside a title or description is
        # not rewritten.
        json_str = re.sub(
            r'"[^"\\]*(?:\\.[^"\\]*)*"|\bundefined\b',
            lambda m: 'null' if m.group(0) == 'undefined' else m.group(0),
            match.group(1),
        )

        # Parse the JSON and extract the note data.
        initial_state = json.loads(json_str)
        note_data = extract_note_data(initial_state)

        # Add the meta-tag video to the note data.
        if video_info and note_data:
            if 'videos' not in note_data or not note_data['videos']:
                note_data['videos'] = []
            note_data['videos'].append(video_info)

        return note_data

    except FileNotFoundError:
        print(f"错误:找不到文件 {html_file}")
        return None
    except json.JSONDecodeError as e:
        print(f"JSON解析错误:{e}")
        return None
    except Exception as e:
        print(f"解析错误:{e}")
        return None
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _collect_images(note_info):
    """Return one simplified dict per entry of the note's ``imageList``."""
    return [
        {
            'url': img.get('urlDefault') or img.get('url'),
            'urlPre': img.get('urlPre'),
            'width': img.get('width'),
            'height': img.get('height'),
        }
        # `or []` also covers an explicit null, not only a missing key.
        for img in note_info.get('imageList') or []
    ]


def _collect_video(note_info):
    """Return the note's video dict, or ``None`` when it has no usable URL."""
    video_info = note_info.get('video') or {}
    if not video_info:
        return None

    media = video_info.get('media') or {}
    hls = (media.get('stream') or {}).get('hls') or {}

    # Prefer the HLS stream URL, then fall back to URLs stored on `media`.
    url = hls.get('masterUrl') or hls.get('url')
    if not url:
        url = media.get('url') or media.get('videoUrl')
    if not url:
        return None

    video_data = {'url': url}
    if video_info.get('cover'):
        video_data['cover'] = video_info.get('cover')
    if video_info.get('time'):
        video_data['time'] = video_info.get('time')
    return video_data


def _collect_topics(note_info):
    """Return topics from the structured topic list plus ``#topic#`` marks in the description."""
    topics = []

    # The topic list may be stored under several field names; use the first non-empty one.
    topic_list = (
        note_info.get('topicList')
        or note_info.get('tagList')
        or note_info.get('hashtagList')
        or []
    )
    for topic in topic_list:
        name = topic.get('name') or topic.get('title') or topic.get('tagName')
        if name:
            topics.append({
                'name': name,
                'id': topic.get('id') or topic.get('topicId') or topic.get('tagId'),
            })

    # Topics written inline in the description as #topic#.
    desc = note_info.get('desc') or ''
    for name in re.findall(r'#([^#]+)#', desc):
        # Skip names that are already present.
        if not any(t.get('name') == name for t in topics):
            topics.append({'name': name})

    return topics


def extract_note_data(initial_state):
    """
    Extract note data (title, description, images, videos, topics) from the initial state.

    The note is selected by ``note.firstNoteId``; when that is absent, the
    first key of ``note.noteDetailMap`` is used. Keys that are present but
    hold ``None`` are treated the same as missing keys.

    Args:
        initial_state: Dict parsed from ``window.__INITIAL_STATE__``.

    Returns:
        dict: ``{'title', 'desc', 'images', 'videos', 'topics'}``, or ``None``
        when no note ID or note information is found, or when an exception is
        raised (the error and its traceback are printed).
    """
    try:
        note_store = initial_state.get('note') or {}
        note_detail_map = note_store.get('noteDetailMap') or {}

        # Resolve the ID of the note to extract.
        first_note_id = note_store.get('firstNoteId')
        if not first_note_id:
            if not note_detail_map:
                print("未找到笔记ID")
                return None
            first_note_id = next(iter(note_detail_map))

        note_detail = note_detail_map.get(first_note_id) or {}
        note_info = note_detail.get('note') or {}
        if not note_info:
            print("未找到笔记信息")
            return None

        video_data = _collect_video(note_info)

        # Only the fields the caller needs.
        return {
            'title': note_info.get('title'),
            'desc': note_info.get('desc'),
            'images': _collect_images(note_info),
            'videos': [video_data] if video_data else [],
            'topics': _collect_topics(note_info),
        }

    except Exception as e:
        print(f"提取笔记数据时出错:{e}")
        import traceback
        traceback.print_exc()
        return None
|
|||
|
|
|
|||
|
|
|
|||
|
|
def print_note_data(note_data):
    """
    Print the note data to standard output in a readable layout.

    Prints a banner, the title, the description (only when non-empty), and
    then, for images, videos and topics, a heading, the item count and the raw
    list.

    Args:
        note_data: Note data dict; a falsy value prints a "no data" message.
    """
    if not note_data:
        print("没有可显示的数据")
        return

    banner = "=" * 60
    print(banner)
    print("笔记信息")
    print(banner)
    print(f"标题: {note_data.get('title')}")
    print()

    print("描述:")
    description = note_data.get('desc', '')
    if description:
        print(description)

    # (heading, key in note_data, count line template) for each list section.
    sections = (
        ("图片列表:", 'images', " 共 {} 张图片"),
        ("视频列表:", 'videos', " 共 {} 个视频"),
        ("话题列表:", 'topics', " 共 {} 个话题"),
    )
    for heading, key, count_template in sections:
        print(heading)
        items = note_data.get(key, [])
        print(count_template.format(len(items)))
        print(items)
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == '__main__':
    # Script entry point: parse 1.html, save the result as JSON, then print it.
    print("正在解析HTML文件...")
    note_data = parse_html_file('1.html')

    if not note_data:
        print("解析失败,请检查HTML文件")
    else:
        # Save to the JSON file first so the data survives a printing failure.
        with open('note_data.json', 'w', encoding='utf-8') as json_file:
            json.dump(note_data, json_file, ensure_ascii=False, indent=2)
        print("数据已保存到 note_data.json")
        print()

        # Printing can still fail (e.g. console encoding problems); report it
        # and point the user at the saved file instead of crashing.
        try:
            print_note_data(note_data)
        except Exception as e:
            print(f"打印数据时出错(但数据已保存到JSON文件): {e}")
            print("请查看 note_data.json 文件获取完整数据")
|