This commit is contained in:
27942
2026-01-14 18:30:41 +08:00
commit 4f6e1f9175
1087 changed files with 4427 additions and 0 deletions

5
.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
.idea
.git
.log
.venv
__pycache__

BIN
0.webp Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 174 KiB

600
1.html Normal file

File diff suppressed because one or more lines are too long

BIN
1.webp Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 127 KiB

BIN
2.webp Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 261 KiB

BIN
3.webp Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 476 KiB

BIN
4.webp Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 293 KiB

BIN
5.webp Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 578 KiB

BIN
6.webp Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 307 KiB

BIN
7.webp Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 368 KiB

BIN
8.webp Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 236 KiB

BIN
downloaded_image.webp Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 185 KiB

48
main.py Normal file
View File

@@ -0,0 +1,48 @@
import requests
# Cookies captured from a browser session. They are defined here but are NOT
# sent with the request below (the ``cookies=`` argument is commented out).
# NOTE(review): ``web_session`` and ``id_token`` look like live session
# credentials; they should be rotated and loaded from the environment or an
# untracked file instead of being committed.
cookies = {
    'abRequestId': '343dbd09-707e-59a3-9315-3b6fa1c3ff34',
    'webBuild': '5.7.0',
    'xsecappid': 'xhs-pc-web',
    'a1': '19bbae7730ahxzrs5lm6s46vde8fq350g5klg6uij50000427550',
    'webId': '17c9fe1d1bc556b2837a35fb01b770b5',
    'gid': 'yjDD0dWYyWk8yjDD0dWWq2K780xCuFM216KMT3V4KhfdY128iAq28k8884JW2288YJWqyYjD',
    'web_session': '0400698f1bdf69567ccb126e523b4ba45d4326',
    'id_token': 'VjEAANiXRALq8n+D/Uh5zBxUZgZQea2cBzD/+4ZtKQHrx2FPtJYJfV+n5N7LJDdNZmVmMugQNdlm0mg6Dy78u0wHOnF2RDB4ZB7i2whxyVT8v97Yrbwz4hbQM3EtVEyNMgzIvZnR',
    'websectiga': '29098a4cf41f76ee3f8db19051aaa60c0fc7c5e305572fec762da32d457d76ae',
    'sec_poison_id': '36102520-3a09-4447-a36f-a1c31b15b950',
    'acw_tc': '0a00d49317683703864606878eb20c852cedbeb990c4ffae205ebc083fcc93',
    'loadts': '1768370387610',
}

# Browser-like request headers (Edge 143 on Windows). No raw ``cookie`` header
# is set here; the same values are available in ``cookies`` above.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'cache-control': 'no-cache',
    'dnt': '1',
    'pragma': 'no-cache',
    'priority': 'u=0, i',
    'sec-ch-ua': '"Microsoft Edge";v="143", "Chromium";v="143", "Not A(Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36 Edg/143.0.0.0',
}

# Fetch the note page. Without a timeout ``requests`` waits indefinitely on an
# unresponsive server, so bound the wait explicitly.
response = requests.get(
    'https://www.xiaohongshu.com/explore/68eab3870000000004004112?xsec_token=ABhDIDWci_QiY0PVUtlkViv01hH_8c3AiHfagOPVFNVms=&xsec_source=pc_search&source=unknown',
    # cookies=cookies,
    headers=headers,
    timeout=30,
)
# Fail loudly on 4xx/5xx instead of saving an error page as if it were the note.
response.raise_for_status()

# Save the HTML to a file.
with open('1.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
print("HTML已保存到 1.html")

140
note_data.json Normal file
View File

@@ -0,0 +1,140 @@
{
"title": "外版三星ZFold大折叠小红书优化排版/显示",
"desc": "不用再羡慕三星国行ZFold的小红书定制客户端和多指捏放多列排版了\n‼一招解决使用网页版快捷方式刷小红书排版合理功能简洁客户端不再臃肿。\n\t\n使用方法p7-p9\n1. Chrome浏览器打开网页版小红书地址在p7。其他浏览器也有类似功能\n2. 浏览器右上角三个点 - “请求访问桌面版” - 然后再 “添加快捷方式到桌面” - 完成!\n\t\n#你会选择什么版本的三星手机? [投票]#\n\t\n#折叠手机的各种形态[话题]# #折叠屏[话题]# #外版[话题]# #美版[话题]# #ZFold7[话题]# #三星zfold6[话题]# #W26[话题]# #心系天下[话题]# #一人一个安卓使用技巧[话题]# #一起聊数码[话题]#",
"images": [
{
"url": "http://sns-webpic-qc.xhscdn.com/202601141733/33e402eeb9c5ff920f17cb2629d8416f/1040g2sg31ngs7s0a5mdg4a37v9gupc36s0j93a8!nd_dft_wlteh_webp_3",
"urlPre": "http://sns-webpic-qc.xhscdn.com/202601141733/139f36d7c4dfd3e2976bb90261a63acd/1040g2sg31ngs7s0a5mdg4a37v9gupc36s0j93a8!nd_prv_wlteh_webp_3",
"width": 1200,
"height": 1600
},
{
"url": "http://sns-webpic-qc.xhscdn.com/202601141733/98a6c7d45537a5f2add622173cf24a84/1040g2sg31ngs7s0a5mbg4a37v9gupc36cg22vc0!nd_dft_wlteh_webp_3",
"urlPre": "http://sns-webpic-qc.xhscdn.com/202601141733/8f43add92ab25a5461b60780e796a674/1040g2sg31ngs7s0a5mbg4a37v9gupc36cg22vc0!nd_prv_wlteh_webp_3",
"width": 1968,
"height": 2184
},
{
"url": "http://sns-webpic-qc.xhscdn.com/202601141733/3d639a88face3b18148057ef197c47f8/1040g2sg31ngs7s0a5m7g4a37v9gupc36j1chdq0!nd_dft_wlteh_webp_3",
"urlPre": "http://sns-webpic-qc.xhscdn.com/202601141733/84d8fcdb12cf658b5d8dc9d50aec40ed/1040g2sg31ngs7s0a5m7g4a37v9gupc36j1chdq0!nd_prv_wlteh_webp_3",
"width": 1968,
"height": 2184
},
{
"url": "http://sns-webpic-qc.xhscdn.com/202601141733/12b653a8a4e91a7cf3277be9193a42bc/1040g2sg31ngs7s0a5md04a37v9gupc36f13sjlo!nd_dft_wlteh_webp_3",
"urlPre": "http://sns-webpic-qc.xhscdn.com/202601141733/80f540a14744b03dcf4fb840a628cc1a/1040g2sg31ngs7s0a5md04a37v9gupc36f13sjlo!nd_prv_wlteh_webp_3",
"width": 1968,
"height": 2184
},
{
"url": "http://sns-webpic-qc.xhscdn.com/202601141733/0ea1907a54e0ce8471aaaad44da0d4f2/1040g2sg31ngs7s0a5m9g4a37v9gupc36qqcttf8!nd_dft_wlteh_webp_3",
"urlPre": "http://sns-webpic-qc.xhscdn.com/202601141733/fb8e92fffb4724431c4e4a24c9933827/1040g2sg31ngs7s0a5m9g4a37v9gupc36qqcttf8!nd_prv_wlteh_webp_3",
"width": 1968,
"height": 2184
},
{
"url": "http://sns-webpic-qc.xhscdn.com/202601141733/302ab15194067c95bae700530d02fde1/1040g2sg31ngs7s0a5mb04a37v9gupc36rhqs4i0!nd_dft_wlteh_webp_3",
"urlPre": "http://sns-webpic-qc.xhscdn.com/202601141733/d250ded63f436abfd99719eae7237bd6/1040g2sg31ngs7s0a5mb04a37v9gupc36rhqs4i0!nd_prv_wlteh_webp_3",
"width": 1968,
"height": 2184
},
{
"url": "http://sns-webpic-qc.xhscdn.com/202601141733/599f42d37a2b4eb59c414f777a81f98c/1040g2sg31ngs7s0a5mc04a37v9gupc36ah2nce0!nd_dft_wlteh_webp_3",
"urlPre": "http://sns-webpic-qc.xhscdn.com/202601141733/b256afa10893b0a619199201263805d3/1040g2sg31ngs7s0a5mc04a37v9gupc36ah2nce0!nd_prv_wlteh_webp_3",
"width": 1968,
"height": 2184
},
{
"url": "http://sns-webpic-qc.xhscdn.com/202601141733/e91cc06969ab855a419970694c0133ab/1040g2sg31ngs7s0a5ma04a37v9gupc36snnfjj0!nd_dft_wlteh_webp_3",
"urlPre": "http://sns-webpic-qc.xhscdn.com/202601141733/d6c2010d73dac749f33a086aad6807f4/1040g2sg31ngs7s0a5ma04a37v9gupc36snnfjj0!nd_prv_wlteh_webp_3",
"width": 1968,
"height": 2184
},
{
"url": "http://sns-webpic-qc.xhscdn.com/202601141733/93d81ee1819745b2b50b7a65e33c461d/1040g2sg31ngs7s0a5mag4a37v9gupc36nbskn20!nd_dft_wlteh_webp_3",
"urlPre": "http://sns-webpic-qc.xhscdn.com/202601141733/e2333c5a4abbbedeabc615fdfd7a1f8d/1040g2sg31ngs7s0a5mag4a37v9gupc36nbskn20!nd_prv_wlteh_webp_3",
"width": 1968,
"height": 2184
}
],
"videos": [],
"topics": [
{
"name": "折叠手机的各种形态",
"id": "64dd7b62000000000f00d00a"
},
{
"name": "折叠屏",
"id": "5cac7179000000000d014ce6"
},
{
"name": "外版",
"id": "613454d90000000001006368"
},
{
"name": "美版",
"id": "5fa0c960000000000100b74a"
},
{
"name": "ZFold7",
"id": "681b7312000000001703ad7e"
},
{
"name": "三星zfold6",
"id": "6649bb29000000001b03fc01"
},
{
"name": "W26",
"id": "6425446d000000000d03c650"
},
{
"name": "心系天下",
"id": "60842235000000000101e832"
},
{
"name": "一人一个安卓使用技巧",
"id": "5c72852c000000000f01a6c4"
},
{
"name": "一起聊数码",
"id": "62bd8992000000000101c0c6"
},
{
"name": "你会选择什么版本的三星手机? ",
"id": "137617844749618429"
},
{
"name": "你会选择什么版本的三星手机? [投票]"
},
{
"name": "折叠手机的各种形态[话题]"
},
{
"name": "折叠屏[话题]"
},
{
"name": "外版[话题]"
},
{
"name": "美版[话题]"
},
{
"name": "ZFold7[话题]"
},
{
"name": "三星zfold6[话题]"
},
{
"name": "W26[话题]"
},
{
"name": "心系天下[话题]"
},
{
"name": "一人一个安卓使用技巧[话题]"
},
{
"name": "一起聊数码[话题]"
}
]
}

281
parse_html.py Normal file
View File

@@ -0,0 +1,281 @@
import re
import json
import sys
import io
from bs4 import BeautifulSoup
# Force UTF-8 on standard output to avoid Windows console encoding problems
# when printing non-ASCII text.
if sys.platform == 'win32':
    if hasattr(sys.stdout, 'reconfigure'):
        # Python 3.7+: switch the encoding of the existing stream in place
        # rather than stacking a second wrapper on the same byte buffer.
        sys.stdout.reconfigure(encoding='utf-8', errors='replace')
    elif hasattr(sys.stdout, 'buffer'):
        # Older interpreters: wrap the underlying byte buffer.
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
    # Otherwise stdout is missing or is a replacement object without a byte
    # buffer; leave it untouched instead of crashing at import time.
def extract_video_from_meta(html_content):
    """
    Extract video information from the ``og:*`` meta tags of an HTML page.

    Args:
        html_content: HTML content as a string.

    Returns:
        dict: Video info containing ``url`` and, when the matching tags are
        present, ``time`` and ``quality``. ``None`` if no video URL is found
        or if an error occurs while parsing.
    """
    # (meta tag name, key under which its content is stored in the result)
    meta_fields = (
        ('og:video', 'url'),
        ('og:videotime', 'time'),
        ('og:videoquality', 'quality'),
    )
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        video_info = {}
        for meta_name, result_key in meta_fields:
            tag = soup.find('meta', {'name': meta_name})
            if tag and tag.get('content'):
                video_info[result_key] = tag.get('content')
        # Duration/quality alone are not useful; a result requires a URL.
        return video_info if video_info.get('url') else None
    except Exception as e:
        print(f"从meta标签提取视频信息时出错{e}")
        return None
# Matches either a complete double-quoted string literal or a bare
# ``undefined`` token. String literals are listed first so that their
# contents are consumed and never inspected for ``undefined``.
_STRING_OR_UNDEFINED_RE = re.compile(r'"(?:\\.|[^"\\])*"|\bundefined\b', re.DOTALL)


def _replace_js_undefined(js_text):
    """Replace bare JavaScript ``undefined`` tokens with JSON ``null``, leaving string literals untouched."""
    return _STRING_OR_UNDEFINED_RE.sub(
        lambda m: 'null' if m.group(0) == 'undefined' else m.group(0),
        js_text,
    )


def parse_html_file(html_file='1.html'):
    """
    Parse a saved HTML file and extract the note's title, description,
    image list, video list and topics.

    The note data is read from the ``window.__INITIAL_STATE__`` script block;
    video information found in the page's meta tags is merged into the
    result's ``videos`` list.

    Args:
        html_file: Path to the HTML file.

    Returns:
        dict: The parsed note data. If no note data can be obtained but the
        meta tags contain a video, a dict with only a ``videos`` key is
        returned. ``None`` if the file is missing, the embedded JSON is
        invalid, or nothing could be extracted.
    """
    try:
        # Read the HTML file.
        with open(html_file, 'r', encoding='utf-8') as f:
            html_content = f.read()

        # Video information from the meta tags (may be None).
        video_info = extract_video_from_meta(html_content)

        # Locate the window.__INITIAL_STATE__ object literal.
        pattern = r'<script>window\.__INITIAL_STATE__\s*=\s*({.*?});?\s*</script>'
        match = re.search(pattern, html_content, re.DOTALL)
        if not match:
            print("未找到 window.__INITIAL_STATE__ 数据")
            # With only meta-tag video information available, return just that.
            if video_info:
                return {'videos': [video_info]}
            return None

        # JSON has no ``undefined``; map bare tokens to null. Only tokens
        # outside string literals are replaced so that note text containing
        # the word "undefined" is preserved verbatim.
        json_str = _replace_js_undefined(match.group(1))
        initial_state = json.loads(json_str)

        note_data = extract_note_data(initial_state)
        if not video_info:
            return note_data
        if not note_data:
            # Same fallback as when the state block is missing entirely.
            return {'videos': [video_info]}

        # Merge the meta-tag video, skipping it when the same URL is already listed.
        videos = note_data.get('videos') or []
        meta_url = video_info.get('url')
        if all(existing.get('url') != meta_url for existing in videos):
            videos.append(video_info)
        note_data['videos'] = videos
        return note_data
    except FileNotFoundError:
        print(f"错误:找不到文件 {html_file}")
        return None
    except json.JSONDecodeError as e:
        print(f"JSON解析错误{e}")
        return None
    except Exception as e:
        print(f"解析错误:{e}")
        return None
# Trailing bracketed marker that the description appends to hashtags,
# e.g. "[话题]" or "[投票]", together with any surrounding whitespace.
_TOPIC_MARKER_RE = re.compile(r'\s*\[[^\[\]]*\]\s*$')


def _normalize_topic_name(name):
    """Return the topic name without surrounding whitespace or a trailing ``[...]`` marker."""
    if not isinstance(name, str):
        return name
    return _TOPIC_MARKER_RE.sub('', name).strip()


def extract_note_data(initial_state):
    """
    Extract note data from the initial state (title, description, image list,
    video list and topics only).

    Topics are taken from the structured topic/tag list first; hashtags
    written as ``#name#`` in the description are then added unless a topic
    with the same normalized name is already present.

    Args:
        initial_state: Dictionary parsed from ``window.__INITIAL_STATE__``.

    Returns:
        dict: Extracted data with keys ``title``, ``desc``, ``images``,
        ``videos`` and ``topics``; ``None`` if no note is found or an error
        occurs.
    """
    try:
        note_store = initial_state.get('note', {})
        note_detail_map = note_store.get('noteDetailMap', {})

        # Prefer the explicit first note id; otherwise fall back to the
        # first key of the detail map.
        first_note_id = note_store.get('firstNoteId')
        if not first_note_id:
            if note_detail_map:
                first_note_id = next(iter(note_detail_map))
            else:
                print("未找到笔记ID")
                return None

        note_detail = note_detail_map.get(first_note_id, {})
        note_info = note_detail.get('note', {})
        if not note_info:
            print("未找到笔记信息")
            return None

        # Only the fields of interest are extracted.
        extracted_data = {
            'title': note_info.get('title'),
            'desc': note_info.get('desc'),
            'images': [],
            'videos': [],
            'topics': []
        }

        # Images.
        for img in note_info.get('imageList', []):
            extracted_data['images'].append({
                'url': img.get('urlDefault') or img.get('url'),
                'urlPre': img.get('urlPre'),
                'width': img.get('width'),
                'height': img.get('height'),
            })

        # Video (if present).
        video_info = note_info.get('video', {})
        if video_info:
            video_data = {}
            media = video_info.get('media', {})
            if media:
                stream = media.get('stream', {})
                if stream:
                    hls = stream.get('hls', {})
                    if hls:
                        video_data['url'] = hls.get('masterUrl') or hls.get('url')
                # No HLS URL: try the other URL fields on the media object.
                if not video_data.get('url'):
                    video_data['url'] = media.get('url') or media.get('videoUrl')
            if video_info.get('cover'):
                video_data['cover'] = video_info.get('cover')
            if video_info.get('time'):
                video_data['time'] = video_info.get('time')
            # A video entry without a URL is not recorded.
            if video_data.get('url'):
                extracted_data['videos'].append(video_data)

        # Structured topics; they may live under several field names.
        topic_list = note_info.get('topicList', []) or note_info.get('tagList', []) or note_info.get('hashtagList', [])
        for topic in topic_list or []:
            topic_data = {
                'name': topic.get('name') or topic.get('title') or topic.get('tagName'),
                'id': topic.get('id') or topic.get('topicId') or topic.get('tagId'),
            }
            if topic_data.get('name'):
                extracted_data['topics'].append(topic_data)

        # Hashtags in the description (#topic# format). The description
        # appends markers such as "[话题]" to each hashtag, so names are
        # normalized before comparison; otherwise every structured topic
        # would be added a second time under its marked-up name.
        desc = note_info.get('desc', '')
        if desc:
            seen_names = {
                _normalize_topic_name(t.get('name')) for t in extracted_data['topics']
            }
            for raw_name in re.findall(r'#([^#]+)#', desc):
                name = _normalize_topic_name(raw_name) or raw_name.strip()
                if not name or name in seen_names:
                    continue
                seen_names.add(name)
                extracted_data['topics'].append({'name': name})

        return extracted_data
    except Exception as e:
        print(f"提取笔记数据时出错:{e}")
        import traceback
        traceback.print_exc()
        return None
def print_note_data(note_data):
    """
    Print note data to standard output in a readable layout.

    Args:
        note_data: Note data dictionary. A falsy value prints a
            "no data to display" message and nothing else.
    """
    if not note_data:
        print("没有可显示的数据")
        return

    banner = "=" * 60
    print(banner)
    print("笔记信息")
    print(banner)
    print(f"标题: {note_data.get('title')}")
    print()

    print("描述:")
    description = note_data.get('desc', '')
    if description:
        print(description)

    # (section heading, key in note_data, label printed after the item count)
    sections = (
        ("图片列表:", 'images', "张图片"),
        ("视频列表:", 'videos', "个视频"),
        ("话题列表:", 'topics', "个话题"),
    )
    for heading, key, count_label in sections:
        print(heading)
        items = note_data.get(key, [])
        print(f"{len(items)} {count_label}")
        print(items)
if __name__ == '__main__':
    # Script entry point: parse the saved page, persist the result as JSON,
    # then print it.
    print("正在解析HTML文件...")
    note_data = parse_html_file('1.html')
    if not note_data:
        print("解析失败请检查HTML文件")
    else:
        # Persist first so the data survives even if printing fails below.
        with open('note_data.json', 'w', encoding='utf-8') as out_file:
            json.dump(note_data, out_file, ensure_ascii=False, indent=2)
        print("数据已保存到 note_data.json")
        print()
        # Printing may still fail (e.g. console encoding); the JSON file is
        # already written at this point.
        try:
            print_note_data(note_data)
        except Exception as exc:
            print(f"打印数据时出错但数据已保存到JSON文件: {exc}")
            print("请查看 note_data.json 文件获取完整数据")

14
test.py Normal file
View File

@@ -0,0 +1,14 @@
from urllib.parse import urlparse
import os
# Sample video URL used to demonstrate extracting a file name from a URL.
url = "https://sns-video-hw.xhscdn.com/stream/110/258/01e6cd08be6e36ad010370019190eceaac_258.mp4"

# Split the URL into its components and keep the path part.
parsed_url = urlparse(url)
path = parsed_url.path

# The file name is the final segment of the path.
filename = os.path.split(path)[1]

print(f"完整路径: {path}", f"文件名: {filename}", sep="\n")

Binary file not shown.

View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,2 @@
{
}

Binary file not shown.

View File

@@ -0,0 +1,12 @@
{
"epochs": [ {
"calculation_time": "13412845270589830",
"config_version": 0,
"model_version": "0",
"padded_top_topics_start_index": 0,
"taxonomy_version": 0,
"top_topics_and_observing_domains": [ ]
} ],
"hex_encoded_hmac_key": "434BF7DBD7DA573B45E0A11AD9045A61B6221D14AE2F9A341E2FEF659AF071F6",
"next_scheduled_calculation_time": "13413450070589860"
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 25 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 25 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 266 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 135 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 284 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 128 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 617 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

Some files were not shown because too many files have changed in this diff Show More