feat: add support for xhs ,jd , chatglm link cleaning

This commit is contained in:
草师傅 2025-08-02 21:39:22 +08:00
parent 7d52e23522
commit 3fb147bcbe

View file

@ -9,6 +9,9 @@ from aiogram.types import Message
from config import config
whitelist_param_links = ['www.iesdouyin.com','item.taobao.com', 'detail.tmall.com', 'h5.m.goofish.com', 'music.163.com',
'www.bilibili.com', 'm.bilibili.com', 'bilibili.com', 'mall.bilibili.com',
'space.bilibili.com', 'live.bilibili.com','item.m.jd.com','item.jd.com','www.xiaohongshu.com']
def matches_adb_selector(url, selector):
"""Check if URL matches the given selector"""
@ -56,6 +59,16 @@ async def extend_short_urls(url):
# 如果 Location 头部没有 http 前缀,可能是相对路径
# 需要将其转换正确的链接
return urlparse(url)._replace(path=r.headers['Location']).geturl()
elif not r.status in [200,403,404,502,503]:
# 对于一些需要“正常”浏览器才能访问的链接,尝试修复
async with session.get(url, allow_redirects=False, headers={'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.7103.48 Safari/537.36'}) as r_fix:
if r_fix.status in [301, 302, 304, 307, 308] and 'Location' in r_fix.headers:
if r_fix.headers['Location'].startswith(('http://', 'https://')):
return r_fix.headers['Location']
else:
# 如果 Location 头部没有 http 前缀,可能是相对路径
# 需要将其转换正确的链接
return urlparse(url)._replace(path=r_fix.headers['Location']).geturl()
return url
def extract_tb_url_from_html(html_content):
@ -129,12 +142,27 @@ def reserve_whitelisted_params(url):
# 重新构建URL
cleaned_query = urlencode(new_query_params, doseq=True)
return urlunparse(parsed_url._replace(query=cleaned_query))
elif parsed_url.hostname in ['www.iesdouyin.com','www.bilibili.com','m.bilibili.com','bilibili.com','mall.bilibili.com','space.bilibili.com','live.bilibili.com']:
if 'music.163.com' in parsed_url.hostname and 'id' not in query_params:
# 如果网易云链接没有id参数不保留任何参数
# 例如 https://music.163.com/song/12345678
new_query_params = {}
cleaned_query = urlencode(new_query_params, doseq=True)
return urlunparse(parsed_url._replace(query=cleaned_query))
elif parsed_url.hostname in ['www.iesdouyin.com','www.bilibili.com','m.bilibili.com','bilibili.com','mall.bilibili.com','space.bilibili.com','live.bilibili.com','item.m.jd.com','item.jd.com','www.xiaohongshu.com']:
# 不保留任何参数
new_query_params = {}
if 'xiaohongshu.com' in parsed_url.hostname and 'xsec_token' in query_params:
# 为了保证能正常访问,小红书链接保留 xsec_token 参数
# 我是不是也应该 f**k 小红书一下
new_query_params = {'xsec_token': query_params['xsec_token']}
# 重新构建URL
cleaned_query = urlencode(new_query_params, doseq=True)
return urlunparse(parsed_url._replace(query=cleaned_query))
elif parsed_url.hostname in ['chatglm.cn'] and query_params:
# 就你叫智谱啊
new_query_params = {'share_conversation_id': query_params['share_conversation_id']}
cleaned_query = urlencode(new_query_params, doseq=True)
return urlunparse(parsed_url._replace(query=cleaned_query))
return url
def transform_into_fixed_url(url):
@ -154,9 +182,7 @@ def transform_into_fixed_url(url):
async def process_url(url):
# 对于适配的网站,直接保留白名单参数并返回
if urlparse(url).hostname in ['www.iesdouyin.com','item.taobao.com', 'detail.tmall.com', 'h5.m.goofish.com', 'music.163.com',
'www.bilibili.com', 'm.bilibili.com', 'bilibili.com', 'mall.bilibili.com',
'space.bilibili.com', 'live.bilibili.com']:
if urlparse(url).hostname in whitelist_param_links:
final_url = reserve_whitelisted_params(url)
if urlparse(final_url).hostname in ['www.iesdouyin.com','bilibili.com', 'm.bilibili.com']:
final_url = transform_into_fixed_url(final_url)
@ -165,10 +191,11 @@ async def process_url(url):
cleaned_url = remove_tracking_params(url)
# 扩展短链接
extended_url = await extend_short_urls(cleaned_url)
if urlparse(extended_url).hostname in ['chatglm.cn']:
final_url = reserve_whitelisted_params(extended_url)
return final_url
# 对于扩展短链接之后的适配的网站,直接保留白名单参数并返回
if urlparse(extended_url).hostname in ['www.iesdouyin.com','item.taobao.com', 'detail.tmall.com', 'h5.m.goofish.com', 'music.163.com',
'www.bilibili.com', 'm.bilibili.com', 'bilibili.com', 'mall.bilibili.com',
'space.bilibili.com', 'live.bilibili.com']:
if urlparse(extended_url).hostname in whitelist_param_links:
final_url = reserve_whitelisted_params(extended_url)
if urlparse(final_url).hostname in ['www.iesdouyin.com','bilibili.com', 'm.bilibili.com']:
final_url = transform_into_fixed_url(final_url)
@ -204,4 +231,5 @@ async def handle_links(message: Message):
if final_urls:
await message.reply(f"{"\n".join(final_urls)}\n消息里有包含跟踪参数的链接,已经帮你转换了哦~\n\n注意:"
f"这个功能是试验性的,可能会出现链接无法访问等问题,如果出现链接没有清理干净的情况,"
f"可以将返回的结果再次发送给bot或者尝试手动清理。\n如果我没有回复链接,说明链接不需要被清理\n如果你找到了这个工具的问题,欢迎把它通过 `/report_broken_links 链接 需要去除的参数等等` 报告给开发者!")
f"可以将返回的结果再次发送给bot或者尝试手动清理。\n如果你找到了这个工具的问题,欢迎"
f"把它通过 `/report_broken_links 链接 需要去除的参数等等` 报告给开发者!")