refactor: uses clearurls to remove params
This commit is contained in:
parent
3e56ad43a0
commit
4388c3fc21
5 changed files with 3214 additions and 86 deletions
@@ -22,8 +22,8 @@ BOT_TOKEN="12345678:<your token>" uv run main.py
I want to build a Matrix bot, but I'm putting that off for the next couple of days

# Special Thanks
- ➗ The Actually Legitimate URL Shortener Tool rules provided some of the link-cleaning features; their license is here:
https://github.com/DandelionSprout/adfilt/blob/master/LICENSE.md
- The link-cleaning feature uses the rules provided by the ClearURLs extension; their license is here:
https://github.com/ClearURLs/Rules/blob/master/LICENSE

# License
GNU General Public License v3.0
3170
assets/clearurls.json
Normal file
File diff suppressed because it is too large
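The new assets/clearurls.json is collapsed in this view. The code in core/link.py below only relies on each provider entry exposing a urlPattern regex and a rules list; a minimal sketch of that shape, written as a Python dict with invented values rather than an excerpt of the committed file, looks like this:

# Minimal sketch of the rules shape core/link.py reads; the "shortlink" provider
# and its values are invented for illustration, not taken from assets/clearurls.json.
clearurls_rules = {
    "providers": {
        "shortlink": {
            "urlPattern": r"^https?://(?:[\w-]+\.)?shortlink\.example",
            "rules": [r"utm_\w+", "fbclid", "spm"],
        }
    }
}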
117
core/link.py
@@ -5,8 +5,8 @@ import re
import html
import asyncio

from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
from abp.filters import parse_filterlist
from urllib.parse import urlparse, parse_qsl, parse_qs, urlencode, urlunparse

from aiogram.types import Message

from config import config
@@ -21,34 +21,10 @@ has_self_redirection_links = ['www.cnbeta.com.tw','m.cnbeta.com.tw','www.landian

has_better_alternative_links = ['www.iesdouyin.com','bilibili.com', 'm.bilibili.com', 'youtu.be','m.youtube.com','x.com', 'twitter.com']

def matches_adb_selector(url, selector):
    """Check if URL matches the given selector"""
    if selector['type'] == 'url-pattern':
        pattern = selector['value']
        # Convert AdBlock pattern to regex
        # ||domain/* becomes ^https?://[^/]*domain.*
        # domain/* becomes .*domain.*
        if pattern.startswith('||'):
            domain_pattern = pattern[2:]
            # Escape special regex chars except * which we'll convert to .*
            domain_pattern = re.escape(domain_pattern).replace(r'\*', '.*')
            regex_pattern = f"^https?://[^/]*{domain_pattern}"
        else:
            # Escape special regex chars except * which we'll convert to .*
            regex_pattern = re.escape(pattern).replace(r'\*', '.*')

        return bool(re.search(regex_pattern, url))
    return False

def should_remove_param(url, filter_rule):
    """Check if parameter should be removed based on filter rule"""
    if filter_rule.action == 'allow':
        return False  # Allowlist rules prevent removal

    if filter_rule.selector:
        return matches_adb_selector(url, filter_rule.selector)

    return True  # No selector means apply to all URLs
# Load ClearURLs rules from JSON file
with open('assets/clearurls.json', 'r', encoding='utf-8') as f:
    import json
    clearurls_rules = json.load(f)

async def extend_short_urls(url):
    """ Expand short URLs """
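The deleted helpers above (matches_adb_selector and should_remove_param) existed to translate AdBlock-style url-pattern selectors into regexes before testing a URL. The ClearURLs providers loaded from assets/clearurls.json already carry a ready-made regex in urlPattern, so that translation step is no longer needed. For reference, the removed conversion behaved roughly like this standalone sketch (the pattern is invented, not taken from the old rule file):

import re

# Standalone sketch of the pattern translation the deleted matches_adb_selector performed.
pattern = "||example.com/*"
domain_pattern = re.escape(pattern[2:]).replace(r"\*", ".*")  # ||domain/* -> escaped domain with wildcards
regex_pattern = f"^https?://[^/]*{domain_pattern}"            # anchor to the scheme and host
print(bool(re.search(regex_pattern, "https://www.example.com/page?id=1")))  # True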
@@ -104,56 +80,47 @@ def extract_tb_url_from_html(html_content):
    return None


def remove_tracking_params(url):
    """ Remove tracking parameters """
    parsed_url = urlparse(url)
    query_params = parse_qs(parsed_url.query)
# Assume clearurls_rules is a dict loaded from the JSON
def remove_tracking_params(url, rules):
    parsed = urlparse(url)
    domain = parsed.netloc.lower()
    path = parsed.path
    matched_rule = None

    # Modified tracking_params collection
    tracking_rules = []
    # Find matching rule by urlPattern
    for site, rule in rules['providers'].items():
        if re.match(rule['urlPattern'], url):
            matched_rule = rule
            break

    with open('assets/LegitimateURLShortener.txt', 'r', encoding='utf-8') as f:
        for line in parse_filterlist(f):
            if hasattr(line, 'options') and line.options:
                for option in line.options:
                    if option[0] == 'removeparam':
                        tracking_rules.append(line)
                        break  # Only add rule once even if multiple removeparam options
    if not matched_rule or not matched_rule['rules']:
        return url

    for rule in tracking_rules:
        if not should_remove_param(url, rule):
            continue
    # Remove tracking params
    query = parse_qsl(parsed.query, keep_blank_values=False)
    filtered_query = [
        (k, v) for k, v in query
        if not any(re.fullmatch(param, k) for param in matched_rule['rules'])
    ]

        for option in rule.options or []:
            if option[0] == 'removeparam':
                param_pattern = option[1]

                if param_pattern is True:  # Remove all params
                    query_params.clear()
                    break
                elif isinstance(param_pattern, str):
                    # Handle regex patterns
                    if param_pattern.startswith('/') and param_pattern.endswith('/'):
                        regex_pattern = param_pattern[1:-1]
                        params_to_remove = [
                            param for param in query_params.keys()
                            if re.search(regex_pattern, param)
                        ]
                    else:
                        # Exact match
                        params_to_remove = [param_pattern] if param_pattern in query_params else []

                    for param in params_to_remove:
                        query_params.pop(param, None)
    # Remove UTM parameters
    utm_params = ['utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content']
    for param in utm_params:
        if param in query_params:
            query_params.pop(param, None)
        if param in filtered_query:
            filtered_query.pop(param, None)

    # Reconstruct URL
    new_query = urlencode(query_params, doseq=True)
    return urlunparse(parsed_url._replace(query=new_query))
    new_query = urlencode(filtered_query)

    cleaned_url = urlunparse((
        parsed.scheme,
        parsed.netloc,
        parsed.path,
        parsed.params,
        new_query,
        parsed.fragment
    ))

    return cleaned_url

def reserve_whitelisted_params(url):
    """ Keep whitelisted parameters """
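For orientation, the block below is a condensed, self-contained restatement of the cleaning step the new remove_tracking_params(url, rules) performs: match a provider by urlPattern, drop query keys that fully match one of its rules, and rebuild the URL. The clean function and the sample_rules provider are illustrative stand-ins, not part of the commit or of assets/clearurls.json.

import re
from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse

# Made-up provider; the real entries live in assets/clearurls.json.
sample_rules = {
    "providers": {
        "example": {
            "urlPattern": r"^https?://(?:[\w-]+\.)?example\.com",
            "rules": [r"utm_\w+", "ref"],
        }
    }
}

def clean(url, rules):
    parsed = urlparse(url)
    # First provider whose urlPattern matches the URL, if any
    matched = next(
        (p for p in rules["providers"].values() if re.match(p["urlPattern"], url)),
        None,
    )
    if not matched or not matched["rules"]:
        return url
    # Keep only query pairs whose key matches none of the provider's rules
    query = [
        (k, v)
        for k, v in parse_qsl(parsed.query, keep_blank_values=False)
        if not any(re.fullmatch(rule, k) for rule in matched["rules"])
    ]
    return urlunparse(parsed._replace(query=urlencode(query)))

print(clean("https://example.com/post?id=42&utm_source=tg&ref=bot", sample_rules))
# -> https://example.com/post?id=42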
@@ -270,7 +237,7 @@ async def process_url(url):
        # URL unchanged; return None to avoid processing it again
        return None
    # For other sites, strip tracking parameters first
    cleaned_url = remove_tracking_params(url)
    cleaned_url = remove_tracking_params(url, clearurls_rules)
    # Expand short URLs
    extended_url = await extend_short_urls(cleaned_url)
    if urlparse(extended_url).hostname in ['chatglm.cn']:
@@ -288,11 +255,11 @@ async def process_url(url):
        # URL unchanged; return None to avoid processing it again
        return None
    if urlparse(extended_url).hostname in has_better_alternative_links:
        removed_tracking_url = remove_tracking_params(extended_url)
        removed_tracking_url = remove_tracking_params(extended_url, clearurls_rules)
        final_url = transform_into_fixed_url(removed_tracking_url)
    else:
        # For other links, just strip tracking parameters
        final_url = remove_tracking_params(extended_url)
        final_url = remove_tracking_params(extended_url, clearurls_rules)
    if url != final_url:
        return final_url
    return None
@@ -13,7 +13,6 @@ dependencies = [
    "matrix-nio==0.25.2",
    "mcstatus==12.0.2",
    "pangu==4.0.6.1",
    "python-abp==0.2.0",
    "pyyaml>=6.0.2",
    "requests>=2.32.4",
]
8
uv.lock
generated
@@ -630,12 +630,6 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/6f/9a/e73262f6c6656262b5fdd723ad90f518f579b7bc8622e43a942eec53c938/pydantic_core-2.33.2-cp313-cp313t-win_amd64.whl", hash = "sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9", size = 1935777, upload-time = "2025-04-23T18:32:25.088Z" },
]

[[package]]
name = "python-abp"
version = "0.2.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/3b/30/31c6e4ca48992ee5f4bb8325f249f944ac493898606ca83a7642ff5ee18b/python-abp-0.2.0.tar.gz", hash = "sha256:f36d0e9fdc089587c26036e0403f36d729395fc9f4dbce45baf3a493d1de8112", size = 80013, upload-time = "2020-05-20T13:09:55.536Z" }

[[package]]
name = "python-dateutil"
version = "2.9.0.post0"
@@ -706,7 +700,6 @@ dependencies = [
    { name = "matrix-nio" },
    { name = "mcstatus" },
    { name = "pangu" },
    { name = "python-abp" },
    { name = "pyyaml" },
    { name = "requests" },
]
@@ -722,7 +715,6 @@ requires-dist = [
    { name = "matrix-nio", specifier = "==0.25.2" },
    { name = "mcstatus", specifier = "==12.0.2" },
    { name = "pangu", specifier = "==4.0.6.1" },
    { name = "python-abp", specifier = "==0.2.0" },
    { name = "pyyaml", specifier = ">=6.0.2" },
    { name = "requests", specifier = ">=2.32.4" },
]