hahaa
This commit is contained in:
281
parse_html.py
Normal file
281
parse_html.py
Normal file
@@ -0,0 +1,281 @@
|
||||
import re
import json
import sys
import io
from bs4 import BeautifulSoup

# Force stdout to UTF-8 on Windows: the console's default legacy code
# page cannot encode CJK output, so wrap the raw buffer and replace
# anything still unencodable instead of crashing.
if sys.platform == 'win32':
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
|
||||
|
||||
|
||||
def extract_video_from_meta(html_content):
    """Extract video information from the HTML document's meta tags.

    Looks for ``og:video`` / ``og:videotime`` / ``og:videoquality`` meta
    tags. The original pages emit these under the ``name`` attribute,
    but the Open Graph protocol specifies the ``property`` attribute,
    so both are checked (``name`` first, preserving prior behavior).

    Args:
        html_content: HTML content as a string.

    Returns:
        dict: Video info with an 'url' key and optional 'time' /
        'quality' keys, or None when no video URL is found or
        parsing fails.
    """
    def _meta_content(soup, key):
        # Check name= first (what the target pages use), then fall back
        # to the standard Open Graph property= attribute.
        for attr in ('name', 'property'):
            tag = soup.find('meta', {attr: key})
            if tag and tag.get('content'):
                return tag.get('content')
        return None

    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        video_info = {}

        url = _meta_content(soup, 'og:video')
        if url:
            video_info['url'] = url

        duration = _meta_content(soup, 'og:videotime')
        if duration:
            video_info['time'] = duration

        quality = _meta_content(soup, 'og:videoquality')
        if quality:
            video_info['quality'] = quality

        # Only report a result when an actual video URL was present.
        if video_info.get('url'):
            return video_info

        return None
    except Exception as e:
        print(f"从meta标签提取视频信息时出错:{e}")
        return None
|
||||
|
||||
|
||||
def parse_html_file(html_file='1.html'):
    """Parse an HTML file and extract title, description, image list,
    video list and topics.

    Args:
        html_file: Path to the HTML file to parse.

    Returns:
        dict: The extracted note data, or None on failure.
    """
    try:
        with open(html_file, 'r', encoding='utf-8') as handle:
            page = handle.read()

        # Video info advertised via <meta> tags, gathered independently
        # of the embedded state blob.
        meta_video = extract_video_from_meta(page)

        # Locate the serialized window.__INITIAL_STATE__ payload.
        state_match = re.search(
            r'<script>window\.__INITIAL_STATE__\s*=\s*({.*?});?\s*</script>',
            page,
            re.DOTALL,
        )
        if state_match is None:
            print("未找到 window.__INITIAL_STATE__ 数据")
            # Fall back to the meta-tag video alone when available.
            return {'videos': [meta_video]} if meta_video else None

        # JavaScript `undefined` is not valid JSON; map it to null
        # before handing the string to the JSON parser.
        raw_state = re.sub(r'\bundefined\b', 'null', state_match.group(1))
        note_data = extract_note_data(json.loads(raw_state))

        # Merge the meta-tag video into the extracted note data.
        if meta_video and note_data:
            if not note_data.get('videos'):
                note_data['videos'] = []
            note_data['videos'].append(meta_video)

        return note_data

    except FileNotFoundError:
        print(f"错误:找不到文件 {html_file}")
        return None
    except json.JSONDecodeError as e:
        print(f"JSON解析错误:{e}")
        return None
    except Exception as e:
        print(f"解析错误:{e}")
        return None
|
||||
|
||||
|
||||
def extract_note_data(initial_state):
    """Extract note data from the parsed ``window.__INITIAL_STATE__``
    dict (title, description, image list, video list and topics only).

    Args:
        initial_state: Dict parsed from window.__INITIAL_STATE__.

    Returns:
        dict: {'title', 'desc', 'images', 'videos', 'topics'}, or None
        when no note could be located.
    """
    try:
        note_store = initial_state.get('note', {})
        note_detail_map = note_store.get('noteDetailMap', {})

        # Prefer the explicit first-note id; otherwise fall back to the
        # first key of the detail map (dict insertion order).
        first_note_id = note_store.get('firstNoteId')
        if not first_note_id:
            if not note_detail_map:
                print("未找到笔记ID")
                return None
            first_note_id = next(iter(note_detail_map))

        note_info = note_detail_map.get(first_note_id, {}).get('note', {})
        if not note_info:
            print("未找到笔记信息")
            return None

        extracted_data = {
            'title': note_info.get('title'),
            'desc': note_info.get('desc'),
            # Keep only the image fields downstream consumers need.
            'images': [
                {
                    'url': img.get('urlDefault') or img.get('url'),
                    'urlPre': img.get('urlPre'),
                    'width': img.get('width'),
                    'height': img.get('height'),
                }
                for img in note_info.get('imageList', [])
            ],
            'videos': [],
            'topics': [],
        }

        # Video: the playable URL normally lives at media.stream.hls,
        # with flat media-level fields as a fallback.
        video_info = note_info.get('video', {})
        if video_info:
            video_data = {}
            media = video_info.get('media', {})
            if media:
                hls = (media.get('stream') or {}).get('hls') or {}
                if hls:
                    video_data['url'] = hls.get('masterUrl') or hls.get('url')
                if not video_data.get('url'):
                    video_data['url'] = media.get('url') or media.get('videoUrl')
            if video_info.get('cover'):
                video_data['cover'] = video_info.get('cover')
            if video_info.get('time'):
                video_data['time'] = video_info.get('time')
            # Discard video entries that carry no usable URL.
            if video_data.get('url'):
                extracted_data['videos'].append(video_data)

        # Topics may live under several different keys depending on the
        # page version; try each in turn.
        topic_list = (note_info.get('topicList', [])
                      or note_info.get('tagList', [])
                      or note_info.get('hashtagList', []))
        for topic in topic_list:
            topic_data = {
                'name': topic.get('name') or topic.get('title') or topic.get('tagName'),
                'id': topic.get('id') or topic.get('topicId') or topic.get('tagId'),
            }
            if topic_data.get('name'):
                extracted_data['topics'].append(topic_data)

        # Also harvest inline "#topic#" markers from the description,
        # skipping names already collected above.
        desc = note_info.get('desc', '')
        if desc:
            for name in re.findall(r'#([^#]+)#', desc):
                if not any(t.get('name') == name for t in extracted_data['topics']):
                    extracted_data['topics'].append({'name': name})

        return extracted_data

    except Exception as e:
        print(f"提取笔记数据时出错:{e}")
        import traceback
        traceback.print_exc()
        return None
|
||||
|
||||
|
||||
def print_note_data(note_data):
    """Pretty-print parsed note data to stdout.

    Args:
        note_data: The note data dict; a falsy value prints a
        "no data" message instead.
    """
    if not note_data:
        print("没有可显示的数据")
        return

    divider = "=" * 60
    print(divider)
    print("笔记信息")
    print(divider)
    print(f"标题: {note_data.get('title')}")
    print()

    print("描述:")
    desc = note_data.get('desc', '')
    if desc:
        print(desc)

    # Each collection section prints: header, count line, raw repr.
    sections = (
        ("图片列表:", 'images', "张图片"),
        ("视频列表:", 'videos', "个视频"),
        ("话题列表:", 'topics', "个话题"),
    )
    for header, key, unit in sections:
        print(header)
        items = note_data.get(key, [])
        print(f" 共 {len(items)} {unit}")
        print(items)
|
||||
|
||||
|
||||
def _main():
    """Script entry point: parse 1.html, save the result as JSON, then
    print a human-readable summary."""
    print("正在解析HTML文件...")
    parsed = parse_html_file('1.html')

    if not parsed:
        print("解析失败,请检查HTML文件")
        return

    # Persist the data first so a later console-printing failure
    # cannot lose it.
    with open('note_data.json', 'w', encoding='utf-8') as out:
        json.dump(parsed, out, ensure_ascii=False, indent=2)
    print("数据已保存到 note_data.json")
    print()

    # Console printing may still fail on exotic characters; the JSON
    # file already holds the full data either way.
    try:
        print_note_data(parsed)
    except Exception as e:
        print(f"打印数据时出错(但数据已保存到JSON文件): {e}")
        print("请查看 note_data.json 文件获取完整数据")


if __name__ == '__main__':
    _main()
|
||||
Reference in New Issue
Block a user