mirror of
https://github.com/omg-xtao/ytdlbot.git
synced 2024-11-16 11:52:14 +00:00
better get canonical
This commit is contained in:
parent
6528a45e81
commit
d6fff5033b
@ -12,7 +12,6 @@ import logging
|
|||||||
import math
|
import math
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import tempfile
|
|
||||||
import time
|
import time
|
||||||
from unittest.mock import MagicMock
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
@ -107,12 +106,20 @@ class VIP(Redis, MySQL):
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def extract_canonical_link(url):
|
def extract_canonical_link(url):
|
||||||
# canonic link works for many websites. It will strip out unnecessary stuff
|
# canonic link works for many websites. It will strip out unnecessary stuff
|
||||||
try:
|
props = ["canonical", "alternate", "shortlinkUrl"]
|
||||||
html_doc = requests.get(url).text
|
headers = {
|
||||||
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"}
|
||||||
|
html_doc = requests.get(url, headers=headers, timeout=5).text
|
||||||
soup = BeautifulSoup(html_doc, "html.parser")
|
soup = BeautifulSoup(html_doc, "html.parser")
|
||||||
element = soup.find("link", rel="canonical")
|
for prop in props:
|
||||||
return element['href']
|
element = soup.find("link", rel=prop)
|
||||||
|
try:
|
||||||
|
href = element["href"]
|
||||||
|
if href not in ["null", "", None]:
|
||||||
|
return href
|
||||||
except Exception:
|
except Exception:
|
||||||
|
logging.warning("Canonical exception %s soup.find(%s, %s=%s) --> %s", url, tag, kw, v, result)
|
||||||
|
|
||||||
return url
|
return url
|
||||||
|
|
||||||
def get_channel_info(self, url: "str"):
|
def get_channel_info(self, url: "str"):
|
||||||
@ -289,5 +296,5 @@ def subscribe_query():
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
a = VIP.extract_canonical_link("https://youtu.be/FUACKXI-1BA?t=71")
|
a = VIP.extract_canonical_link("https://www.youtube.com/shorts/YrnvPPGznXM")
|
||||||
print(a)
|
print(a)
|
||||||
|
Loading…
Reference in New Issue
Block a user