refactor: use ClearURLs rules to remove tracking params

This commit is contained in:
草师傅 2025-10-02 12:35:39 +08:00
parent 3e56ad43a0
commit 4388c3fc21
5 changed files with 3214 additions and 86 deletions

View file

@ -22,8 +22,8 @@ BOT_TOKEN="12345678:<your token>" uv run main.py
我想做 matrix bot,不过这两天先不做了
# 特别感谢
- ➗ Actually Legitimate URL Shortener Tool 规则提供了一些链接清理的特性,他们的许可证在这里:
https://github.com/DandelionSprout/adfilt/blob/master/LICENSE.md
- 链接清理的特性使用了 ClearURLs 插件提供的规则,他们的许可证在这里:
https://github.com/ClearURLs/Rules/blob/master/LICENSE
# 许可证
GNU General Public License v3.0

3170
assets/clearurls.json Normal file

File diff suppressed because it is too large Load diff

View file

@ -5,8 +5,8 @@ import re
import html
import asyncio
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
from abp.filters import parse_filterlist
from urllib.parse import urlparse, parse_qsl, parse_qs, urlencode, urlunparse
from aiogram.types import Message
from config import config
@ -21,34 +21,10 @@ has_self_redirection_links = ['www.cnbeta.com.tw','m.cnbeta.com.tw','www.landian
has_better_alternative_links = ['www.iesdouyin.com','bilibili.com', 'm.bilibili.com', 'youtu.be','m.youtube.com','x.com', 'twitter.com']
def matches_adb_selector(url, selector):
    """Check whether ``url`` matches an AdBlock-style URL-pattern selector.

    Args:
        url: Full URL string to test.
        selector: Mapping with a ``'type'`` and a ``'value'`` key. Only
            selectors of type ``'url-pattern'`` are understood.

    Returns:
        True when the pattern matches ``url``; False otherwise, including
        for any unsupported selector type.
    """
    if selector['type'] != 'url-pattern':
        return False

    pattern = selector['value']
    domain_anchored = pattern.startswith('||')
    if domain_anchored:
        pattern = pattern[2:]

    # Escape regex metacharacters, then translate the AdBlock wildcards:
    #   *  -> any run of characters
    #   ^  -> a separator character (anything but letter, digit, _ - . %)
    #         or the end of the URL
    regex_pattern = re.escape(pattern).replace(r'\*', '.*')
    regex_pattern = regex_pattern.replace(r'\^', r'(?:[^\w\-.%]|$)')

    if domain_anchored:
        # "||" anchors at the start of a host label, so "||example.com"
        # matches example.com and sub.example.com but not notexample.com.
        regex_pattern = r'^https?://(?:[^/]*\.)?' + regex_pattern

    return bool(re.search(regex_pattern, url))
def should_remove_param(url, filter_rule):
    """Decide whether ``filter_rule`` permits removing parameters from ``url``.

    Args:
        url: The URL being cleaned.
        filter_rule: Rule object exposing ``action`` and ``selector``
            attributes.

    Returns:
        False for allowlist rules; otherwise the result of matching the
        rule's selector against ``url``, or True when the rule has no
        selector.
    """
    # Allowlist rules always veto removal.
    if filter_rule.action == 'allow':
        return False

    selector = filter_rule.selector
    # A rule without a selector applies to every URL.
    return matches_adb_selector(url, selector) if selector else True
# Read the bundled ClearURLs rule set once, at import time.
import json

with open('assets/clearurls.json', mode='r', encoding='utf-8') as rules_file:
    clearurls_rules = json.load(rules_file)
async def extend_short_urls(url):
""" 扩展短链接 """
@ -104,56 +80,47 @@ def extract_tb_url_from_html(html_content):
return None
def remove_tracking_params(url, rules=None):
    """Strip tracking query parameters from ``url`` using ClearURLs-style rules.

    Every provider whose ``urlPattern`` matches the URL contributes its
    parameter patterns (so a catch-all provider is applied in addition to a
    site-specific one). The common ``utm_*`` campaign parameters are always
    removed, even when no provider matches.

    Args:
        url: The URL to clean.
        rules: Mapping with a ``'providers'`` dict. Each provider carries a
            ``'urlPattern'`` regex and, optionally, ``'rules'`` (regexes
            matched against whole parameter names) and ``'exceptions'``
            (URL regexes for which that provider is skipped). Defaults to
            the module-level ``clearurls_rules``.

    Returns:
        The cleaned URL, or ``url`` itself (untouched, not re-encoded) when
        no parameter was removed.
    """
    if rules is None:
        rules = clearurls_rules

    # Patterns are matched case-insensitively, mirroring how the ClearURLs
    # extension compiles them.
    flags = re.IGNORECASE

    # Collect parameter patterns from every matching provider.
    param_patterns = []
    for provider in rules.get('providers', {}).values():
        url_pattern = provider.get('urlPattern')
        if not url_pattern or not re.search(url_pattern, url, flags):
            continue
        exceptions = provider.get('exceptions', ())
        if any(re.search(exc, url, flags) for exc in exceptions):
            continue
        # Providers are not required to define 'rules'.
        param_patterns.extend(provider.get('rules', ()))

    utm_params = {
        'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
    }

    def is_tracking(name):
        if name in utm_params:
            return True
        return any(re.fullmatch(p, name, flags) for p in param_patterns)

    parsed = urlparse(url)
    # Keep blank values so non-tracking parameters such as "?flag=" survive.
    query = parse_qsl(parsed.query, keep_blank_values=True)
    kept = [(key, value) for key, value in query if not is_tracking(key)]

    # Nothing removed: hand back the original string so that re-encoding
    # differences are not mistaken for a cleaned URL by callers.
    if len(kept) == len(query):
        return url

    return urlunparse(parsed._replace(query=urlencode(kept)))
def reserve_whitelisted_params(url):
""" 保留白名单中的参数 """
@ -270,7 +237,7 @@ async def process_url(url):
# 链接没有变化,直接返回 None避免重复处理
return None
# 对于其它的网站,首先清理跟踪参数
cleaned_url = remove_tracking_params(url)
cleaned_url = remove_tracking_params(url, clearurls_rules)
# 扩展短链接
extended_url = await extend_short_urls(cleaned_url)
if urlparse(extended_url).hostname in ['chatglm.cn']:
@ -288,11 +255,11 @@ async def process_url(url):
# 链接没有变化,直接返回 None避免重复处理
return None
if urlparse(extended_url).hostname in has_better_alternative_links:
removed_tracking_url = remove_tracking_params(extended_url)
removed_tracking_url = remove_tracking_params(extended_url, clearurls_rules)
final_url = transform_into_fixed_url(removed_tracking_url)
else:
# 对于其他链接,直接对其进行跟踪参数清理
final_url = remove_tracking_params(extended_url)
final_url = remove_tracking_params(extended_url, clearurls_rules)
if url != final_url:
return final_url
return None

View file

@ -13,7 +13,6 @@ dependencies = [
"matrix-nio==0.25.2",
"mcstatus==12.0.2",
"pangu==4.0.6.1",
"python-abp==0.2.0",
"pyyaml>=6.0.2",
"requests>=2.32.4",
]

8
uv.lock generated
View file

@ -630,12 +630,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/6f/9a/e73262f6c6656262b5fdd723ad90f518f579b7bc8622e43a942eec53c938/pydantic_core-2.33.2-cp313-cp313t-win_amd64.whl", hash = "sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9", size = 1935777, upload-time = "2025-04-23T18:32:25.088Z" },
]
[[package]]
name = "python-abp"
version = "0.2.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/3b/30/31c6e4ca48992ee5f4bb8325f249f944ac493898606ca83a7642ff5ee18b/python-abp-0.2.0.tar.gz", hash = "sha256:f36d0e9fdc089587c26036e0403f36d729395fc9f4dbce45baf3a493d1de8112", size = 80013, upload-time = "2020-05-20T13:09:55.536Z" }
[[package]]
name = "python-dateutil"
version = "2.9.0.post0"
@ -706,7 +700,6 @@ dependencies = [
{ name = "matrix-nio" },
{ name = "mcstatus" },
{ name = "pangu" },
{ name = "python-abp" },
{ name = "pyyaml" },
{ name = "requests" },
]
@ -722,7 +715,6 @@ requires-dist = [
{ name = "matrix-nio", specifier = "==0.25.2" },
{ name = "mcstatus", specifier = "==12.0.2" },
{ name = "pangu", specifier = "==4.0.6.1" },
{ name = "python-abp", specifier = "==0.2.0" },
{ name = "pyyaml", specifier = ">=6.0.2" },
{ name = "requests", specifier = ">=2.32.4" },
]