hahaa
This commit is contained in:
281
parse_html.py
Normal file
281
parse_html.py
Normal file
@@ -0,0 +1,281 @@
|
||||
import re
import json
import sys
import io
from bs4 import BeautifulSoup

# Force stdout to UTF-8 on Windows: the console's default legacy code
# page cannot encode CJK output, so wrap the raw buffer and replace
# anything still unencodable instead of crashing.
if sys.platform == 'win32':
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
|
||||
|
||||
|
||||
def extract_video_from_meta(html_content):
    """Extract video information from the HTML document's meta tags.

    Looks for ``og:video`` / ``og:videotime`` / ``og:videoquality`` meta
    tags. The original pages emit these under the ``name`` attribute,
    but the Open Graph protocol specifies the ``property`` attribute,
    so both are checked (``name`` first, preserving prior behavior).

    Args:
        html_content: HTML content as a string.

    Returns:
        dict: Video info with an 'url' key and optional 'time' /
        'quality' keys, or None when no video URL is found or
        parsing fails.
    """
    def _meta_content(soup, key):
        # Check name= first (what the target pages use), then fall back
        # to the standard Open Graph property= attribute.
        for attr in ('name', 'property'):
            tag = soup.find('meta', {attr: key})
            if tag and tag.get('content'):
                return tag.get('content')
        return None

    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        video_info = {}

        url = _meta_content(soup, 'og:video')
        if url:
            video_info['url'] = url

        duration = _meta_content(soup, 'og:videotime')
        if duration:
            video_info['time'] = duration

        quality = _meta_content(soup, 'og:videoquality')
        if quality:
            video_info['quality'] = quality

        # Only report a result when an actual video URL was present.
        if video_info.get('url'):
            return video_info

        return None
    except Exception as e:
        print(f"从meta标签提取视频信息时出错:{e}")
        return None
|
||||
|
||||
|
||||
def parse_html_file(html_file='1.html'):
    """Parse an HTML file and extract title, description, image list,
    video list and topics.

    Args:
        html_file: Path to the HTML file to parse.

    Returns:
        dict: The extracted note data, or None on failure.
    """
    try:
        with open(html_file, 'r', encoding='utf-8') as handle:
            page = handle.read()

        # Video info advertised via <meta> tags, gathered independently
        # of the embedded state blob.
        meta_video = extract_video_from_meta(page)

        # Locate the serialized window.__INITIAL_STATE__ payload.
        state_match = re.search(
            r'<script>window\.__INITIAL_STATE__\s*=\s*({.*?});?\s*</script>',
            page,
            re.DOTALL,
        )
        if state_match is None:
            print("未找到 window.__INITIAL_STATE__ 数据")
            # Fall back to the meta-tag video alone when available.
            return {'videos': [meta_video]} if meta_video else None

        # JavaScript `undefined` is not valid JSON; map it to null
        # before handing the string to the JSON parser.
        raw_state = re.sub(r'\bundefined\b', 'null', state_match.group(1))
        note_data = extract_note_data(json.loads(raw_state))

        # Merge the meta-tag video into the extracted note data.
        if meta_video and note_data:
            if not note_data.get('videos'):
                note_data['videos'] = []
            note_data['videos'].append(meta_video)

        return note_data

    except FileNotFoundError:
        print(f"错误:找不到文件 {html_file}")
        return None
    except json.JSONDecodeError as e:
        print(f"JSON解析错误:{e}")
        return None
    except Exception as e:
        print(f"解析错误:{e}")
        return None
|
||||
|
||||
|
||||
def extract_note_data(initial_state):
    """Extract note data from the parsed ``window.__INITIAL_STATE__``
    dict (title, description, image list, video list and topics only).

    Args:
        initial_state: Dict parsed from window.__INITIAL_STATE__.

    Returns:
        dict: {'title', 'desc', 'images', 'videos', 'topics'}, or None
        when no note could be located.
    """
    try:
        note_store = initial_state.get('note', {})
        note_detail_map = note_store.get('noteDetailMap', {})

        # Prefer the explicit first-note id; otherwise fall back to the
        # first key of the detail map (dict insertion order).
        first_note_id = note_store.get('firstNoteId')
        if not first_note_id:
            if not note_detail_map:
                print("未找到笔记ID")
                return None
            first_note_id = next(iter(note_detail_map))

        note_info = note_detail_map.get(first_note_id, {}).get('note', {})
        if not note_info:
            print("未找到笔记信息")
            return None

        extracted_data = {
            'title': note_info.get('title'),
            'desc': note_info.get('desc'),
            # Keep only the image fields downstream consumers need.
            'images': [
                {
                    'url': img.get('urlDefault') or img.get('url'),
                    'urlPre': img.get('urlPre'),
                    'width': img.get('width'),
                    'height': img.get('height'),
                }
                for img in note_info.get('imageList', [])
            ],
            'videos': [],
            'topics': [],
        }

        # Video: the playable URL normally lives at media.stream.hls,
        # with flat media-level fields as a fallback.
        video_info = note_info.get('video', {})
        if video_info:
            video_data = {}
            media = video_info.get('media', {})
            if media:
                hls = (media.get('stream') or {}).get('hls') or {}
                if hls:
                    video_data['url'] = hls.get('masterUrl') or hls.get('url')
                if not video_data.get('url'):
                    video_data['url'] = media.get('url') or media.get('videoUrl')
            if video_info.get('cover'):
                video_data['cover'] = video_info.get('cover')
            if video_info.get('time'):
                video_data['time'] = video_info.get('time')
            # Discard video entries that carry no usable URL.
            if video_data.get('url'):
                extracted_data['videos'].append(video_data)

        # Topics may live under several different keys depending on the
        # page version; try each in turn.
        topic_list = (note_info.get('topicList', [])
                      or note_info.get('tagList', [])
                      or note_info.get('hashtagList', []))
        for topic in topic_list:
            topic_data = {
                'name': topic.get('name') or topic.get('title') or topic.get('tagName'),
                'id': topic.get('id') or topic.get('topicId') or topic.get('tagId'),
            }
            if topic_data.get('name'):
                extracted_data['topics'].append(topic_data)

        # Also harvest inline "#topic#" markers from the description,
        # skipping names already collected above.
        desc = note_info.get('desc', '')
        if desc:
            for name in re.findall(r'#([^#]+)#', desc):
                if not any(t.get('name') == name for t in extracted_data['topics']):
                    extracted_data['topics'].append({'name': name})

        return extracted_data

    except Exception as e:
        print(f"提取笔记数据时出错:{e}")
        import traceback
        traceback.print_exc()
        return None
|
||||
|
||||
|
||||
def print_note_data(note_data):
    """Pretty-print parsed note data to stdout.

    Args:
        note_data: The note data dict; a falsy value prints a
        "no data" message instead.
    """
    if not note_data:
        print("没有可显示的数据")
        return

    divider = "=" * 60
    print(divider)
    print("笔记信息")
    print(divider)
    print(f"标题: {note_data.get('title')}")
    print()

    print("描述:")
    desc = note_data.get('desc', '')
    if desc:
        print(desc)

    # Each collection section prints: header, count line, raw repr.
    sections = (
        ("图片列表:", 'images', "张图片"),
        ("视频列表:", 'videos', "个视频"),
        ("话题列表:", 'topics', "个话题"),
    )
    for header, key, unit in sections:
        print(header)
        items = note_data.get(key, [])
        print(f" 共 {len(items)} {unit}")
        print(items)
|
||||
|
||||
|
||||
def _main():
    """Script entry point: parse 1.html, save the result as JSON, then
    print a human-readable summary."""
    print("正在解析HTML文件...")
    parsed = parse_html_file('1.html')

    if not parsed:
        print("解析失败,请检查HTML文件")
        return

    # Persist the data first so a later console-printing failure
    # cannot lose it.
    with open('note_data.json', 'w', encoding='utf-8') as out:
        json.dump(parsed, out, ensure_ascii=False, indent=2)
    print("数据已保存到 note_data.json")
    print()

    # Console printing may still fail on exotic characters; the JSON
    # file already holds the full data either way.
    try:
        print_note_data(parsed)
    except Exception as e:
        print(f"打印数据时出错(但数据已保存到JSON文件): {e}")
        print("请查看 note_data.json 文件获取完整数据")


if __name__ == '__main__':
    _main()
|
||||
Reference in New Issue
Block a user