mirror of
https://github.com/omg-xtao/ytdlbot.git
synced 2025-01-30 17:58:36 +00:00
faster extract_canonical_link
This commit is contained in:
parent
30195dba3f
commit
a57d204b11
@@ -8,6 +8,7 @@
|
||||
__author__ = "Benny <benny.think@gmail.com>"
|
||||
|
||||
import hashlib
|
||||
import http
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
@@ -109,6 +110,13 @@ class VIP(Redis, MySQL):
|
||||
props = ["canonical", "alternate", "shortlinkUrl"]
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"}
|
||||
# send head request first
|
||||
r = requests.head(url, headers=headers)
|
||||
if r.status_code != http.HTTPStatus.METHOD_NOT_ALLOWED and r.headers.get("content-type") != "text/html":
|
||||
# get content-type, if it's not text/html, there's no need to issue a GET request
|
||||
logging.warning("%s Content-type is not text/html, no need to GET for extract_canonical_link", url)
|
||||
return url
|
||||
|
||||
html_doc = requests.get(url, headers=headers, timeout=5).text
|
||||
soup = BeautifulSoup(html_doc, "html.parser")
|
||||
for prop in props:
|
||||
|
Loading…
Reference in New Issue
Block a user