爬取http://www.weather.com.cn/ 2023-2024每月数据
This commit is contained in:
parent
ef6261b146
commit
cebbbc86d2
93
main.py
93
main.py
@ -1,16 +1,89 @@
|
|||||||
# This is a sample Python script.
|
import json
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
# Press Shift+F10 to execute it or replace it with your code.
|
import httpx
|
||||||
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
|
|
||||||
|
# Download the city index. The payload's first line is dropped so that the
# remainder can be parsed as JSON.
response = httpx.get('https://j.i8tq.com/weather2020/search/city.js')
citys_str = "\n".join(response.text.split("\n", 1)[1:])
citys_json = json.loads(citys_str)

# Map every top-level key of the index to a flat list of single-entry
# ``{name: AREAID}`` dicts collected from the two nesting levels below it.
citys_dict = {}
for city, provinces in citys_json.items():
    citys_dict[city] = [
        {province_name: province_mes[province_name]['AREAID']}
        for province_mes in provinces.values()
        for province_name in province_mes
    ]

# Base URL of the monthly calendar endpoint.
root_url = "http://d1.weather.com.cn/calendar_new"
|
||||||
|
|
||||||
|
|
||||||
def print_hi(name):
|
def generate_year_and_month_list(start_year, start_month):
    """Return every month from the start month through the current month.

    Args:
        start_year: Year of the first month to include.
        start_month: Month number (1-12) of the first month to include.

    Returns:
        A list of ``"YYYYMM"`` strings in chronological order whose last
        entry is the month of ``datetime.now()``. The list is empty when
        the start month lies in the future.

    Raises:
        ValueError: If the arguments do not form a valid date (raised by
            the ``datetime`` constructor).
    """
    current_date = datetime.now()
    # Building the date validates the arguments the same way as before.
    start_date = datetime(start_year, start_month, 1)

    year_and_month_list = []
    year, month = start_date.year, start_date.month
    last = (current_date.year, current_date.month)

    # Step and compare (year, month) pairs instead of adding 31 days to a
    # date: the 31-day step lets the day of month drift forward, which made
    # the loop drop the current month early in that month.
    while (year, month) <= last:
        year_and_month_list.append(f"{year}{month:02d}")
        if month == 12:
            year, month = year + 1, 1
        else:
            month += 1

    return year_and_month_list
|
||||||
|
|
||||||
|
|
||||||
# Press the green button in the gutter to run the script.
|
all_dates_str = generate_year_and_month_list(2023, 1)
years = [2023, 2024]

# The request headers never change, so build them once instead of once per
# request.
headers = {
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Referer': 'http://www.weather.com.cn/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
}

# Leading text stripped from each response body before JSON parsing.
_JS_PREFIX = "var fc40 = "

# Fields copied from every entry of the parsed payload.
_RECORD_KEYS = ("date", "hgl", "hmax", "hmin")


def _fetch_month_records(area_id, date):
    """Download and parse one month of calendar data for one area.

    Args:
        area_id: ``AREAID`` value taken from the city index.
        date: Month to fetch, as a ``"YYYYMM"`` string.

    Returns:
        A list with one dict per payload entry, holding that entry's
        ``date``, ``hgl``, ``hmax`` and ``hmin`` values.

    Raises:
        httpx.HTTPError: If the request fails or returns an error status.
        ValueError: If the body is not UTF-8 or not valid JSON.
        KeyError: If an entry lacks one of the expected fields.
    """
    # The year segment of the path is taken from the month string itself so
    # the two can never disagree (e.g. "/2023/..._202401.html").
    uri = f"/{date[:4]}/{area_id}_{date}.html"
    # Current Unix time in seconds, sent as the `_` query parameter
    # (presumably a cache buster -- confirm).
    params = {
        '_': str(int(datetime.timestamp(datetime.now()))),
    }
    response = httpx.get(
        root_url + uri,
        params=params,
        headers=headers,
        verify=False,
    )
    response.raise_for_status()

    # Decode the raw bytes as UTF-8 directly rather than re-encoding text
    # that was decoded with whatever codec the client guessed.
    decoded_html_content = response.content.decode('utf-8')
    data_dict = json.loads(decoded_html_content[len(_JS_PREFIX):])
    return [{key: item[key] for key in _RECORD_KEYS} for item in data_dict]


# Only request months whose year is listed in `years`. Previously every year
# was paired with every month, producing URLs whose year directory did not
# match the month.
target_dates = [date for date in all_dates_str if int(date[:4]) in years]

for provinces in citys_dict.values():
    for province in provinces:
        for area_id in province.values():
            for date in target_dates:
                try:
                    ls = _fetch_month_records(area_id, date)
                except (httpx.HTTPError, ValueError, KeyError) as exc:
                    # One bad month should not abort the whole crawl.
                    print(f"skipped {area_id} {date}: {exc!r}")
                    continue
                print(province)
                print(ls)
|
||||||
|
|
||||||
# See PyCharm help at https://www.jetbrains.com/help/pycharm/
|
|
||||||
|
40
test.py
Normal file
40
test.py
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
import json
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Browser-like headers sent with the request.
headers = {
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Referer': 'http://www.weather.com.cn/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
}
# Current Unix time in seconds, sent as the `_` query parameter
# (presumably a cache buster -- confirm).
timestamp = str(int(datetime.timestamp(datetime.now())))
params = {
    '_': timestamp,
}

response = requests.get(
    'http://d1.weather.com.cn/calendar_new/2024/101040200_202405.html',
    params=params,
    headers=headers,
    verify=False,
)

# Decode the raw bytes as UTF-8 directly. Re-encoding `response.text` with
# `response.encoding` fails when that attribute is None and depends on the
# codec the library guessed.
decoded_html_content = response.content.decode('utf-8')
json_str = decoded_html_content[len("var fc40 = "):]

# Parse the JSON text and keep only the fields of interest from each entry.
data_dict = json.loads(json_str)
ls = [
    {
        "date": item["date"],
        "hgl": item["hgl"],
        "hmax": item["hmax"],
        "hmin": item["hmin"],
    }
    for item in data_dict
]
print(ls)
|
Loading…
Reference in New Issue
Block a user