# haha/parse_html.py

import re
import json
import sys
import io
from bs4 import BeautifulSoup

# Force UTF-8 on standard output so that printing non-ASCII text does not
# fail on Windows consoles that default to a legacy code page.
if sys.platform == 'win32':
    if hasattr(sys.stdout, 'reconfigure'):
        # Preferred (Python 3.7+): change the encoding of the existing stream
        # in place, which keeps its buffering settings, instead of stacking a
        # second text wrapper on top of the same binary buffer.
        sys.stdout.reconfigure(encoding='utf-8', errors='replace')
    elif hasattr(sys.stdout, 'buffer'):
        # Fallback for text streams without reconfigure() that still expose
        # their underlying binary buffer.
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
    # Otherwise stdout is None or a replacement object without a buffer;
    # leave it untouched rather than crash at import time.


def extract_video_from_meta(html_content):
    """
    Extract video information from the ``og:*`` ``<meta>`` tags of a page.

    Each field is looked up first by ``name="og:..."`` and, if that yields
    no non-empty ``content``, by ``property="og:..."`` (the attribute used by
    the Open Graph protocol).

    Args:
        html_content: HTML document as a string.

    Returns:
        dict | None: A dict with ``url`` and, when present, ``time`` and
        ``quality``; every value is the tag's ``content`` attribute taken
        verbatim. None when no video URL is found or when parsing raises.
    """
    # Result key -> identifier of the meta tag that carries it.
    meta_fields = (
        ('url', 'og:video'),
        ('time', 'og:videotime'),
        ('quality', 'og:videoquality'),
    )

    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        video_info = {}

        for key, meta_id in meta_fields:
            # ``name`` is tried first so pages that already worked keep
            # producing exactly the same values.
            for attr in ('name', 'property'):
                tag = soup.find('meta', {attr: meta_id})
                content = tag.get('content') if tag is not None else None
                if content:
                    video_info[key] = content
                    break

        # Duration/quality alone are useless without a URL.
        return video_info if video_info.get('url') else None
    except Exception as e:
        # Best effort: meta extraction must never abort the overall parse.
        print(f"从meta标签提取视频信息时出错：{e}")
        return None


# Matches either a complete JSON string literal or a bare ``undefined`` token.
# Matching string literals first lets the substitution skip over them.
_JS_STRING_OR_UNDEFINED = re.compile(
    r'"[^"\\]*(?:\\.[^"\\]*)*"|\bundefined\b',
    re.DOTALL,
)


def _replace_js_undefined(json_str):
    """Replace bare JavaScript ``undefined`` tokens with ``null``, leaving string literals intact."""
    return _JS_STRING_OR_UNDEFINED.sub(
        lambda m: 'null' if m.group(0) == 'undefined' else m.group(0),
        json_str,
    )


def parse_html_file(html_file='1.html'):
    """
    Parse an HTML file and extract title, description, images, videos and topics.

    The note itself comes from the ``window.__INITIAL_STATE__`` object
    embedded in a ``<script>`` tag; video information found in the page's
    meta tags is appended to the note's ``videos`` list.

    Args:
        html_file: Path of the HTML file to read (UTF-8).

    Returns:
        dict | None: The extracted note data. When no note can be extracted
        but the meta tags describe a video, a dict containing only
        ``videos`` is returned. None when the file is missing, the embedded
        JSON is invalid, or nothing could be extracted.
    """
    try:
        with open(html_file, 'r', encoding='utf-8') as f:
            html_content = f.read()

        # Video information advertised through <meta> tags.
        video_info = extract_video_from_meta(html_content)

        # Locate the embedded state object; the <script> tag may carry
        # attributes, so they are tolerated.
        pattern = r'<script[^>]*>\s*window\.__INITIAL_STATE__\s*=\s*({.*?});?\s*</script>'
        match = re.search(pattern, html_content, re.DOTALL)

        if not match:
            print("未找到 window.__INITIAL_STATE__ 数据")
            # Only the meta-tag video is available: return just that.
            if video_info:
                return {'videos': [video_info]}
            return None

        # JSON has no ``undefined``; map it to null, but only outside string
        # literals so that text containing the word is not corrupted.
        json_str = _replace_js_undefined(match.group(1))

        initial_state = json.loads(json_str)
        note_data = extract_note_data(initial_state)

        if video_info:
            if note_data:
                if not note_data.get('videos'):
                    note_data['videos'] = []
                note_data['videos'].append(video_info)
            else:
                # Same fallback as the "state not found" branch above: do not
                # discard the video just because the note could not be read.
                note_data = {'videos': [video_info]}

        return note_data

    except FileNotFoundError:
        print(f"错误：找不到文件 {html_file}")
        return None
    except json.JSONDecodeError as e:
        print(f"JSON解析错误：{e}")
        return None
    except Exception as e:
        print(f"解析错误：{e}")
        return None


def _as_dict(value):
    """Return *value* when it is a dict, otherwise an empty dict (covers JSON null)."""
    return value if isinstance(value, dict) else {}


def _as_list(value):
    """Return *value* when it is a list, otherwise an empty list (covers JSON null)."""
    return value if isinstance(value, list) else []


def _extract_images(note_info):
    """Build the list of image dicts from ``note_info['imageList']``."""
    return [
        {
            'url': img.get('urlDefault') or img.get('url'),
            'urlPre': img.get('urlPre'),
            'width': img.get('width'),
            'height': img.get('height'),
        }
        for img in _as_list(note_info.get('imageList'))
        if isinstance(img, dict)
    ]


def _extract_video(note_info):
    """Return a video dict from ``note_info['video']``, or None when no URL is available."""
    video_info = _as_dict(note_info.get('video'))
    if not video_info:
        return None

    media = _as_dict(video_info.get('media'))
    hls = _as_dict(_as_dict(media.get('stream')).get('hls'))
    # Prefer the HLS stream, then fall back to URL fields directly on ``media``.
    url = (
        hls.get('masterUrl')
        or hls.get('url')
        or media.get('url')
        or media.get('videoUrl')
    )
    if not url:
        return None

    video_data = {'url': url}
    if video_info.get('cover'):
        video_data['cover'] = video_info.get('cover')
    if video_info.get('time'):
        video_data['time'] = video_info.get('time')
    return video_data


def _extract_topics(note_info):
    """Collect topics from the structured topic list and from ``#topic#`` markers in ``desc``."""
    topics = []

    # The structured list may live under several field names; use the first
    # non-empty one.
    topic_list = (
        note_info.get('topicList')
        or note_info.get('tagList')
        or note_info.get('hashtagList')
    )
    for topic in _as_list(topic_list):
        if not isinstance(topic, dict):
            continue
        name = topic.get('name') or topic.get('title') or topic.get('tagName')
        if name:
            topics.append({
                'name': name,
                'id': topic.get('id') or topic.get('topicId') or topic.get('tagId'),
            })

    desc = note_info.get('desc')
    if isinstance(desc, str) and desc:
        seen = {t['name'] for t in topics if isinstance(t['name'], str)}
        for raw_name in re.findall(r'#([^#]+)#', desc):
            # Drop surrounding whitespace so that "# foo #" and "#foo#" are
            # the same topic and whitespace-only matches are ignored.
            name = raw_name.strip()
            if name and name not in seen:
                seen.add(name)
                topics.append({'name': name})

    return topics


def extract_note_data(initial_state):
    """
    Extract note data (title, description, images, videos, topics) from the
    parsed ``window.__INITIAL_STATE__`` object.

    The note is looked up in ``note.noteDetailMap`` under ``note.firstNoteId``
    or, when that id is absent, under the first key of the map. Fields whose
    value is null or of an unexpected type are treated as missing instead of
    aborting the whole extraction.

    Args:
        initial_state: The decoded ``window.__INITIAL_STATE__`` dict.

    Returns:
        dict | None: A dict with the keys ``title``, ``desc``, ``images``,
        ``videos`` and ``topics``, or None when no note can be located or an
        unexpected error occurs.
    """
    try:
        note_store = _as_dict(_as_dict(initial_state).get('note'))
        note_detail_map = _as_dict(note_store.get('noteDetailMap'))

        first_note_id = note_store.get('firstNoteId')
        if not first_note_id:
            # No explicit id: fall back to the first entry of the detail map.
            if not note_detail_map:
                print("未找到笔记ID")
                return None
            first_note_id = next(iter(note_detail_map))

        note_detail = _as_dict(note_detail_map.get(first_note_id))
        note_info = _as_dict(note_detail.get('note'))
        if not note_info:
            print("未找到笔记信息")
            return None

        video = _extract_video(note_info)
        return {
            'title': note_info.get('title'),
            'desc': note_info.get('desc'),
            'images': _extract_images(note_info),
            'videos': [video] if video else [],
            'topics': _extract_topics(note_info),
        }

    except Exception as e:
        print(f"提取笔记数据时出错：{e}")
        import traceback
        traceback.print_exc()
        return None


def print_note_data(note_data):
    """
    Print a human-readable summary of a parsed note to standard output.

    Prints a banner, the title, the description (only when non-empty) and,
    for each of images / videos / topics, a heading, the item count and the
    raw list.

    Args:
        note_data: Note data dict; when falsy, a "nothing to display"
            message is printed instead.
    """
    if not note_data:
        print("没有可显示的数据")
        return

    # Header banner.
    rule = "=" * 60
    print(rule, "笔记信息", rule, sep="\n")

    print(f"标题: {note_data.get('title')}")
    print()

    print("描述:")
    description = note_data.get('desc', '')
    if description:
        print(description)

    # (heading, key in note_data, count-line template) for each list section.
    sections = (
        ("图片列表:", 'images', "  共 {} 张图片"),
        ("视频列表:", 'videos', "  共 {} 个视频"),
        ("话题列表:", 'topics', "  共 {} 个话题"),
    )
    for heading, key, count_line in sections:
        print(heading)
        entries = note_data.get(key, [])
        print(count_line.format(len(entries)))
        print(entries)


if __name__ == '__main__':
    # Usage: python <this script> [HTML_FILE] [OUTPUT_JSON]
    # Both arguments are optional and default to the previously hard-coded
    # '1.html' and 'note_data.json', so running without arguments behaves
    # exactly as before.
    html_path = sys.argv[1] if len(sys.argv) > 1 else '1.html'
    output_path = sys.argv[2] if len(sys.argv) > 2 else 'note_data.json'

    print("正在解析HTML文件...")
    note_data = parse_html_file(html_path)

    if note_data:
        # Persist first, so the data survives even if printing fails below.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(note_data, f, ensure_ascii=False, indent=2)
        print(f"数据已保存到 {output_path}")
        print()

        # Printing may still fail on consoles with encoding problems; the
        # JSON file is already written at this point.
        try:
            print_note_data(note_data)
        except Exception as e:
            print(f"打印数据时出错（但数据已保存到JSON文件）: {e}")
            print(f"请查看 {output_path} 文件获取完整数据")
    else:
        print("解析失败，请检查HTML文件")