🐛 Fix semaphore synchronization issue in WikiModel async generator

In `WikiModel.full_data_generator`, the semaphore was not decremented when a scraping task raised an exception, so the consumer could wait on the queue indefinitely. The `task` function now wraps the scrape in a `try`/`except`/`finally` block that logs the failure and decrements the semaphore regardless of the task's outcome.
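The failure mode and the fix reduce to a standard `try`/`finally` pattern around a shared counter. Below is a minimal, self-contained sketch of that pattern, not the project's actual code: `scrape_one`, the sample URLs, and the drain loop at the bottom are illustrative stand-ins for `cls._scrape`, the wiki page list, and the rest of the generator body (which this diff truncates).

import asyncio
from asyncio import Queue
from multiprocessing import Value


async def scrape_one(url: str) -> str:
    # Hypothetical scraper standing in for cls._scrape; fails for one
    # URL so the error path is exercised.
    if "bad" in url:
        raise RuntimeError(f"failed to fetch {url}")
    return f"data from {url}"


async def full_data_sketch(urls):
    queue: Queue[str] = Queue()  # holds finished results
    signal = Value("i", 0)  # counts scraping tasks still in flight

    async def task(u):
        try:
            await queue.put(await scrape_one(u))
        except Exception as exc:  # pylint: disable=W0703
            print(f"scraping failed: {exc}")
        finally:
            # The fix: decrement unconditionally. If this ran only on
            # success, a failed task would leave signal.value > 0 forever
            # and the drain loop below would never terminate.
            signal.value -= 1

    for u in urls:
        signal.value += 1
        asyncio.ensure_future(task(u))

    while signal.value > 0 or not queue.empty():
        if queue.empty():
            await asyncio.sleep(0)  # give the scraping tasks a turn
            continue
        yield await queue.get()


async def main():
    async for item in full_data_sketch(["https://ok/1", "https://bad/2", "https://ok/3"]):
        print(item)


asyncio.run(main())

The effect of the `finally` is visible on the failing URL: the task logs the error but still decrements `signal`, so the drain loop's exit condition eventually holds instead of spinning forever.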
commit ec42cde7b8 (parent 553c4fb805)
luoshuijs, 2023-10-08 18:52:23 +08:00, committed via GitHub
2 changed files with 17 additions and 10 deletions


@@ -11,7 +11,8 @@ from bs4 import BeautifulSoup
 from httpx import URL, AsyncClient, HTTPError, Response
 from pydantic import BaseConfig as PydanticBaseConfig
 from pydantic import BaseModel as PydanticBaseModel
 from typing_extensions import Self
+from utils.log import logger

 try:
     import ujson as jsonlib
@@ -90,7 +91,7 @@ class WikiModel(Model):
     @classmethod
     @abstractmethod
-    async def _parse_soup(cls, soup: BeautifulSoup) -> Self:
+    async def _parse_soup(cls, soup: BeautifulSoup) -> "WikiModel":
         """Parse the soup and build the corresponding WikiModel

         Args:
@@ -100,7 +101,7 @@ class WikiModel(Model):
         """

     @classmethod
-    async def _scrape(cls, url: Union[URL, str]) -> Self:
+    async def _scrape(cls, url: Union[URL, str]) -> "WikiModel":
         """Scrape data from the url and return the corresponding Model

         Args:
@@ -112,7 +113,7 @@ class WikiModel(Model):
         return await cls._parse_soup(BeautifulSoup(response.text, "lxml"))

     @classmethod
-    async def get_by_id(cls, id_: str) -> Self:
+    async def get_by_id(cls, id_: str) -> "WikiModel":
         """Get the Model by its ID

         Args:
@@ -123,7 +124,7 @@ class WikiModel(Model):
         return await cls._scrape(await cls.get_url_by_id(id_))

     @classmethod
-    async def get_by_name(cls, name: str) -> Optional[Self]:
+    async def get_by_name(cls, name: str) -> Optional["WikiModel"]:
         """Get the Model by its name

         Args:
@@ -135,7 +136,7 @@ class WikiModel(Model):
         return None if url is None else await cls._scrape(url)

     @classmethod
-    async def get_full_data(cls) -> List[Self]:
+    async def get_full_data(cls) -> List["WikiModel"]:
         """Get the Models for the full data set

         Returns:
@@ -144,7 +145,7 @@ class WikiModel(Model):
         return [i async for i in cls.full_data_generator()]

     @classmethod
-    async def full_data_generator(cls) -> AsyncIterator[Self]:
+    async def full_data_generator(cls) -> AsyncIterator["WikiModel"]:
         """Model generator

         This is an async generator: when used, it scrapes all the data, converts each entry into the corresponding Model, and stores the results in a queue.
@@ -153,12 +154,17 @@
         Returns:
             A List of every WikiModel that could be scraped
         """
-        queue: Queue[Self] = Queue()  # queue that holds the Models
+        queue: Queue["WikiModel"] = Queue()  # queue that holds the Models
         signal = Value("i", 0)  # a signal used to synchronize the async tasks

         async def task(u):
             # the wrapped scraping task
-            await queue.put(await cls._scrape(u))  # scrape one entry and put it into the queue
-            signal.value -= 1  # decrement the semaphore by 1: this scraping task has finished
+            try:
+                await queue.put(await cls._scrape(u))  # scrape one entry and put it into the queue
+            except Exception as exc:  # pylint: disable=W0703
+                logger.error("Exception while scraping data: %s", str(exc))
+                logger.debug("Exception details", exc_info=exc)
+            finally:
+                signal.value -= 1  # decrement the semaphore by 1: this scraping task has finished

         for _, url in await cls.get_name_list(with_url=True):  # iterate over every page that needs scraping


@@ -124,6 +124,7 @@ class Association(Enum):
     Inazuma = "稻妻"
     Liyue = "璃月"
     Mondstadt = "蒙德"
+    Fontaine = "枫丹"

     @classmethod
     def convert(cls, string: str) -> Optional[Self]: