爬取http://www.weather.com.cn/ 2023-2024每月数据

2024-05-26 22:13:52 +08:00 · 2024-05-26 22:13:52 +08:00 · cebbbc86d2
commit cebbbc86d2
parent ef6261b146
2 changed files with 123 additions and 10 deletions
--- a/main.py
+++ b/main.py
@ -1,16 +1,89 @@
-# This is a sample Python script.
+import json
+from datetime import datetime, timedelta

-# Press Shift+F10 to execute it or replace it with your code.
-# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
+import httpx
+
+response = httpx.get('https://j.i8tq.com/weather2020/search/city.js')
+# Discard the first line of the payload; the remainder is parsed as JSON.
+citys_str = "\n".join(response.text.split("\n")[1:])
+citys_json = json.loads(citys_str)
+# Map each top-level key to a flat list of single-entry {name: AREAID} dicts
+# collected from every group nested beneath it.
+citys_dict = {}
+for city, provinces in citys_json.items():
+    citys_dict[city] = [
+        {province_name: province_info['AREAID']}
+        for province_mes in provinces.values()
+        for province_name, province_info in province_mes.items()
+    ]
+
+root_url = "http://d1.weather.com.cn/calendar_new"


-def print_hi(name):
-    # Use a breakpoint in the code line below to debug your script.
-    print(f'Hi, {name}')  # Press Ctrl+F8 to toggle the breakpoint.
+def generate_year_and_month_list(start_year, start_month):
+    """Return "YYYYMM" strings for each month from the start month through now.
+
+    Args:
+        start_year: Year of the first month to emit.
+        start_month: Month number (1-12) of the first month to emit.
+
+    Returns:
+        Chronological list of "YYYYMM" strings (month zero-padded); empty
+        when the start month is later than the current month.
+
+    Raises:
+        ValueError: If start_year/start_month do not form a valid date.
+    """
+    start = datetime(start_year, start_month, 1)  # validates the arguments
+    now = datetime.now()
+    # Step through an absolute month index: adding timedelta(days=31) drifts
+    # past the 1st and can skip the current month early in the month.
+    months = range(start.year * 12 + start.month - 1, now.year * 12 + now.month)
+    return [f"{m // 12}{m % 12 + 1:02d}" for m in months]


-# Press the green button in the gutter to run the script.
-if __name__ == '__main__':
-    print_hi('PyCharm')
+all_dates_str = generate_year_and_month_list(2023, 1)
+years = [2023, 2024]
+headers = {  # identical for every request, so built once outside the loops
+    'Accept': '*/*',
+    'Accept-Language': 'zh-CN,zh;q=0.9',
+    'Connection': 'keep-alive',
+    'Referer': 'http://www.weather.com.cn/',
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
+}
+# The year directory in the URL has to be the year of the requested month.
+# Pairing every entry of `years` with every month would also request
+# mismatched URLs such as /2023/<area>_202401.html, so the year is taken
+# from the "YYYYMM" string and `years` only restricts which months are crawled.
+wanted_dates = [date for date in all_dates_str if int(date[:4]) in years]
+for provinces in citys_dict.values():
+    for province in provinces:
+        for area_id in province.values():
+            for date in wanted_dates:
+                uri = f"/{date[:4]}/{area_id}_{date}.html"
+                # Current Unix time in seconds, sent as the `_` query parameter.
+                timestamp = str(int(datetime.timestamp(datetime.now())))
+                response = httpx.get(
+                    root_url + uri,
+                    params={'_': timestamp},
+                    headers=headers,
+                    verify=False,
+                )
+                # Decode the raw bytes directly; re-encoding response.text with
+                # the detected charset is only correct if that round trip is lossless.
+                decoded_html_content = response.content.decode('utf-8')
+                # Drop the leading "var fc40 = " assignment so only JSON remains.
+                json_str = decoded_html_content[len("var fc40 = "):]
+                data_dict = json.loads(json_str)
+                ls = [
+                    {
+                        "date": item["date"],
+                        "hgl": item["hgl"],
+                        "hmax": item["hmax"],
+                        "hmin": item["hmin"],
+                    }
+                    for item in data_dict
+                ]
+                print(province)
+                print(ls)

-# See PyCharm help at https://www.jetbrains.com/help/pycharm/
--- a/test.py
+++ b/test.py
@ -0,0 +1,40 @@
+import json
+from datetime import datetime
+
+import requests
+
+headers = {
+    'Accept': '*/*',
+    'Accept-Language': 'zh-CN,zh;q=0.9',
+    'Connection': 'keep-alive',
+    'Referer': 'http://www.weather.com.cn/',
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
+}
+# Current Unix time in seconds, sent as the `_` query parameter.
+timestamp = str(int(datetime.timestamp(datetime.now())))
+params = {'_': timestamp}
+
+response = requests.get(
+    'http://d1.weather.com.cn/calendar_new/2024/101040200_202405.html',
+    params=params,
+    headers=headers,
+    verify=False,
+)
+
+# Decode the raw bytes as UTF-8 directly. requests may report
+# response.encoding as None, in which case content.encode(encoding) raises.
+decoded_html_content = response.content.decode('utf-8')
+# Drop the leading "var fc40 = " assignment so only the JSON remains.
+json_str = decoded_html_content[len("var fc40 = "):]
+# Parse the JSON text; every item is read as a mapping below.
+data_dict = json.loads(json_str)
+ls = [
+    {
+        "date": item["date"],
+        "hgl": item["hgl"],
+        "hmax": item["hmax"],
+        "hmin": item["hmin"],
+    }
+    for item in data_dict
+]
+print(ls)