📚 修复 Google 搜索无法使用的问题

2020-07-24 20:04:29 +08:00 · 2020-07-24 20:04:29 +08:00 · 7e66443e38
commit 7e66443e38
parent aa2fcc773d
1 changed files with 27 additions and 101 deletions
--- a/pagermaid/modules/external.py
+++ b/pagermaid/modules/external.py
@ -2,15 +2,12 @@
 from googletrans import Translator, LANGUAGES
 from os import remove
-from urllib import request, parse
+from requests import get
 from math import ceil
 from time import sleep
 from threading import Thread
 from bs4 import BeautifulSoup
 from gtts import gTTS
 from re import compile as regex_compile
 from re import search, sub
 from collections import deque
 from pagermaid import log
 from pagermaid.listener import listener, config
 from pagermaid.utils import clear_emojis, attach_log, fetch_youtube_audio
@ -101,31 +98,40 @@ async def tts(context):
        await context.delete()
-@listener(outgoing=True, command="google",
+@listener(outgoing=True, command="googletest",
          description="使用 Google 查询",
          parameters="<query>")
-async def google(context):
+async def googletest(context):
    """Search Google for the query in ``context.arguments`` and show the hits.

    The results page is fetched with ``requests.get`` using a desktop Firefox
    user-agent header and parsed with BeautifulSoup's ``html.parser``. For
    each ``div`` of class ``r`` the ``h3`` text and the first anchor's
    ``href`` are appended as a ``[title](link)`` entry; iteration stops once
    ``config['result_length']`` blocks have been visited. The message is
    edited in place with the collected entries and the query is logged. When
    the argument is empty, or the HTTP status is not 200, the message is
    edited with an error text instead.
    """
    USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
    headers = {"user-agent": USER_AGENT}
    if context.arguments == "":
        await context.edit("出错了呜呜呜 ~ 无效的参数。")
        return
    query = context.arguments
    # Only spaces are rewritten; the rest of the query is placed in the URL as-is.
    query = query.replace(' ', '+')
    URL = f"https://google.com/search?q={query}"
    await context.edit("正在拉取结果 . . .")
-    search_results = GoogleSearch().search(query=query)
+    resp = get(URL, headers=headers)
-    results = ""
+    if resp.status_code == 200:
-    count = 0
+        soup = BeautifulSoup(resp.content, "html.parser")
-    for result in search_results.results:
+        results = ""
-        if count == int(config['result_length']):
+        count = 0
-            break
+        for g in soup.find_all('div', class_='r'):
-        count += 1
+            if count == int(config['result_length']):
-        title = result.title
+                break
-        link = result.url
+            count += 1
-        desc = result.text
+            anchors = g.find_all('a')
-        results += f"\n[{title}]({link}) \n`{desc}`\n"
+            if anchors:
-    await context.edit(f"**Google** |`{query}`| 🎙 🔍 \n"
+                title = g.find('h3').text
-                       f"{results}",
+                link = anchors[0]['href']
-                       link_preview=False)
+                results += f"\n[{title}]({link}) \n"
-    await log(f"在Google搜索引擎上查询了 `{query}`")
+        await context.edit(f"**Google** |`{query}`| 🎙 🔍 \n"
                           f"{results}",
                           link_preview=False)
        await log(f"在Google搜索引擎上查询了 `{query}`")
    else:
        await context.edit("连接到 google服务器 失败")
@listener(outgoing=True, command="fetchaudio",
@ -156,83 +162,3 @@ async def fetchaudio(context):
            await context.edit("出错了呜呜呜 ~ 原声带下载失败。")
        await log(f"从链接中获取了一条音频，链接： {url}.")
        await context.delete()
 class GoogleSearch:
    """Scraper for Google's HTML search results page.

    Result anchors are located with ``RESULT_SELECTOR`` and the total hit
    count with ``TOTAL_SELECTOR``. Pages whose markup lacks those elements
    yield no results / a ``None`` total instead of raising.
    """

    USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:69.0) Gecko/20100101 Firefox/69.0"
    SEARCH_URL = "https://google.com/search"
    RESULT_SELECTOR = "div.r > a"
    TOTAL_SELECTOR = "#resultStats"
    RESULTS_PER_PAGE = 10
    DEFAULT_HEADERS = [
        ('User-Agent', USER_AGENT),
        ("Accept-Language", "en-US,en;q=0.5"),
    ]

    def search(self, query, num_results=10, prefetch_pages=True, prefetch_threads=10):
        """Fetch result pages for *query* and return a ``SearchResponse``.

        :param query: search string; it is URL-quoted before being sent.
        :param num_results: maximum number of results to collect; enough pages
            of ``RESULTS_PER_PAGE`` entries are requested to cover it.
        :param prefetch_pages: when true, ``get_text`` of every result is run
            on a worker thread.
        :param prefetch_threads: upper bound on concurrently alive worker
            threads; values below 1 are treated as 1.
        :return: ``SearchResponse(results, total)`` where ``total`` is the hit
            count parsed from the page, or ``None`` if it could not be found.
        """
        search_results = []
        pages = int(ceil(num_results / float(GoogleSearch.RESULTS_PER_PAGE)))
        fetcher_threads = deque([])
        total = None
        # A limit below 1 could never be satisfied and would spin forever.
        thread_limit = max(1, prefetch_threads)
        for page in range(pages):
            start = page * GoogleSearch.RESULTS_PER_PAGE
            url = GoogleSearch.SEARCH_URL + "?q=" + parse.quote(query)
            if start != 0:
                url += "&start=" + str(start)
            opener = request.build_opener()
            # Copy so the opener never aliases the shared class-level list.
            opener.addheaders = list(GoogleSearch.DEFAULT_HEADERS)
            response = opener.open(url)
            try:
                soup = BeautifulSoup(response.read(), "lxml")
            finally:
                # Release the connection even when reading or parsing fails.
                response.close()
            if total is None:
                total = self._parse_total(soup)
            results = self.parse_results(soup.select(GoogleSearch.RESULT_SELECTOR))
            # Trim the page so the overall count never exceeds num_results.
            remaining = num_results - len(search_results)
            if len(results) > remaining:
                del results[remaining:]
            search_results += results
            if prefetch_pages:
                for result in results:
                    # Wait until fewer than thread_limit workers are alive.
                    while sum(1 for worker in fetcher_threads if worker.is_alive()) >= thread_limit:
                        sleep(1)
                    fetcher_thread = Thread(target=result.get_text)
                    fetcher_thread.start()
                    fetcher_threads.append(fetcher_thread)
        for thread in fetcher_threads:
            thread.join()
        return SearchResponse(search_results, total)

    @staticmethod
    def _parse_total(soup):
        """Return the hit count found under ``TOTAL_SELECTOR``, or ``None``."""
        stats = soup.select(GoogleSearch.TOTAL_SELECTOR)
        if not stats:
            return None
        first_child = next(iter(stats[0].children), None)
        if first_child is None:
            return None
        match = search("(([0-9]+[', ])*[0-9]+)", str(first_child))
        if match is None:
            return None
        # Strip the thousands separators before converting to int.
        return int(sub("[', ]", "", match.group(1)))

    @staticmethod
    def parse_results(results):
        """Convert matched result anchors into ``SearchResult`` objects.

        Anchors without an ``h3`` title or an ``href`` are skipped; a missing
        snippet block yields an empty text.

        :param results: iterable of anchor elements matched by
            ``RESULT_SELECTOR``.
        :return: list of ``SearchResult`` in input order.
        """
        search_results = []
        for result in results:
            headings = result.find_all('h3')
            if not headings:
                continue
            url = result.get("href")
            if url is None:
                continue
            parent = result.parent
            container = parent.parent if parent is not None else None
            snippets = container.find_all('div', {'class': 's'}) if container is not None else []
            text = snippets[0].text if snippets else ""
            search_results.append(SearchResult(headings[0].text, url, text))
        return search_results
 class SearchResponse:
    """Holds a collection of search results together with a total value."""

    def __init__(self, results, total):
        # Both values are kept as plain public attributes.
        self.results, self.total = results, total
 class SearchResult:
    """One search hit: a title, a URL and a text snippet."""

    def __init__(self, title, url, text):
        # Assigned left to right, so the attribute order is title, url, text.
        self.title, self.url, self.text = title, url, text

    def get_text(self):
        """Return the stored text snippet."""
        return self.text

    def __str__(self):
        """Render the instance's attribute dictionary as a string."""
        attributes = vars(self)
        return str(attributes)

    def __repr__(self):
        """Use the same rendering as ``__str__``."""
        return str(self)