Make extract_canonical_link faster: send a HEAD request first and skip the GET when the response is not text/html

This commit is contained in:
BennyThink 2022-02-27 10:49:04 +08:00
parent 30195dba3f
commit a57d204b11
No known key found for this signature in database
GPG Key ID: 6CD0DBDA5235D481

View File

@ -8,6 +8,7 @@
__author__ = "Benny <benny.think@gmail.com>"
import hashlib
import http
import logging
import math
import os
@ -109,6 +110,13 @@ class VIP(Redis, MySQL):
props = ["canonical", "alternate", "shortlinkUrl"]
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"}
# send head request first
r = requests.head(url, headers=headers)
if r.status_code != http.HTTPStatus.METHOD_NOT_ALLOWED and r.headers.get("content-type") != "text/html":
# get content-type, if it's not text/html, there's no need to issue a GET request
logging.warning("%s Content-type is not text/html, no need to GET for extract_canonical_link", url)
return url
html_doc = requests.get(url, headers=headers, timeout=5).text
soup = BeautifulSoup(html_doc, "html.parser")
for prop in props: