https://github.com/scrapy/w3lib
pip install w3lib
w3lib.url
canonicalize_url(url, keep_fragments=False)
规范化 URL
```python
from w3lib.url import canonicalize_url
url = "HTTP://www.Example.com/a%20b?b=2&a=1#fragment"
canonical_url = canonicalize_url(url)
print(canonical_url) # 输出: "http://www.example.com/a%20b?a=1&b=2"
```
add_or_replace_parameter(url, name, new_value)
在 URL 中添加或替换查询参数
```python
from w3lib.url import add_or_replace_parameter
url = "http://www.example.com/page?param1=value1"
new_url = add_or_replace_parameter(url, "param2", "value2")
print(new_url) # 输出: "http://www.example.com/page?param1=value1&param2=value2"
```
is_url(url)
检查字符串是否为有效的 URL
```python
from w3lib.url import is_url
url = "http://www.example.com"
print(is_url(url)) # 输出: True
invalid_url = "not_a_url"
print(is_url(invalid_url)) # 输出: False
```
url_query_cleaner(url, parameterlist=(), remove=False, unique=True)
清理 URL 中的查询参数
```python
from w3lib.url import url_query_cleaner
url = "http://www.example.com/page?param1=value1&param2=value2&param1=value3"
cleaned_url = url_query_cleaner(url, parameterlist=["param1", "param2"])
print(cleaned_url) # 输出: "http://www.example.com/page?param1=value1&param2=value2"
```
safe_url_string(url)
将 URL 转换为安全的字符串形式(编码 URL)
```python
from w3lib.url import safe_url_string
url = "http://www.example.com/文件"
safe_url = safe_url_string(url)
print(safe_url) # 输出: "http://www.example.com/%E6%96%87%E4%BB%B6"
```
w3lib.html
remove_tags(html, which_ones=())
移除 HTML 字符串中的指定标签
```python
from w3lib.html import remove_tags
html = "<p>Hello <b>World</b>!</p>"
clean_html = remove_tags(html, which_ones=["b"])
print(clean_html) # 输出: "<p>Hello World!</p>"
```
replace_tags(text, token="", encoding=None)
将文本中的所有 HTML 标签替换为给定的字符串(该函数没有 which_ones 参数,无法只替换指定标签)
```python
from w3lib.html import replace_tags
html = "<p>Hello <b>World</b>!</p>"
replaced_html = replace_tags(html, token="**")
print(replaced_html) # 输出: "**Hello **World**!**"
```
strip_html5_whitespace(text)
移除字符串首尾的 HTML5 空白字符
```python
from w3lib.html import strip_html5_whitespace
text = " Hello World! \n\t"
clean_text = strip_html5_whitespace(text)
print(clean_text) # 输出: "Hello World!"
```
replace_entities(text)
替换 HTML 实体为对应的字符。用于将 `&amp;`、`&lt;` 等 HTML 实体转换为其对应的字符。
```python
from w3lib.html import replace_entities
text = "Hello &amp; World"
clean_text = replace_entities(text)
print(clean_text) # 输出: "Hello & World"
```
符号 | 实体名称 | 实体编号 | 描述 |
---|---|---|---|
`&` | `&amp;` | `&#38;` | 和号(Ampersand) |
`<` | `&lt;` | `&#60;` | 小于号(Less Than) |
`>` | `&gt;` | `&#62;` | 大于号(Greater Than) |
`"` | `&quot;` | `&#34;` | 双引号(Quotation Mark) |
`'` | `&apos;` | `&#39;` | 单引号(Apostrophe) |
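符号 | 实体名称 | 实体编号 | 描述 |
---|---|---|---|
U+00A0 | `&nbsp;` | `&#160;` | 不间断空格(Non-Breaking Space) |
U+2002 | `&ensp;` | `&#8194;` | 短空格(En Space) |
U+2003 | `&emsp;` | `&#8195;` | 长空格(Em Space) |
U+2009 | `&thinsp;` | `&#8201;` | 细空格(Thin Space) |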
符号 | 实体名称 | 实体编号 | 描述 |
---|---|---|---|
+ | `&plus;` | `&#43;` | 加号(Plus Sign) |
− | `&minus;` | `&#8722;` | 减号(Minus Sign) |
× | `&times;` | `&#215;` | 乘号(Multiplication Sign) |
÷ | `&divide;` | `&#247;` | 除号(Division Sign) |
± | `&plusmn;` | `&#177;` | 正负号(Plus-Minus Sign) |
≠ | `&ne;` | `&#8800;` | 不等号(Not Equal) |
w3lib.encoding
html_body_declared_encoding(body)
从 HTML `<meta>` 标签中提取声明的字符编码(返回值为规范化后的编码名称)
```python
from w3lib.encoding import html_body_declared_encoding
html = '<meta charset="UTF-8"><p>Hello World!</p>'
encoding = html_body_declared_encoding(html)
print(encoding) # 输出: "utf-8"
```
w3lib.http
normalize_headers(headers)
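将 HTTP 头转换为标准化格式(注意:w3lib 官方文档中 w3lib.http 列出的函数为 headers_raw_to_dict、headers_dict_to_raw 和 basic_auth_header,使用前请确认所安装的版本是否提供 normalize_headers)
```python
from w3lib.http import normalize_headers
headers = {
    "Content-Type": "text/html",
    "User-Agent": "Mozilla/5.0"
}
normalized_headers = normalize_headers(headers)
print(normalized_headers) # 输出: {'content-type': ['text/html'], 'user-agent': ['Mozilla/5.0']}
```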