import csv
import random

import pandas as pd
import requests
from fake_useragent import UserAgent

# Build the proxy IP pool
# List for storing "ip:port" strings
ip_port_list = []
username = 'd3347396121'
password = 'ufinmek6'

# Read the CSV file of proxy addresses
with open('ips.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # skip the first row (the header)
    for row in reader:
        # Join the IP address and port into an "ip:port" string and add it to the list
        ip_port = ':'.join(row)
        ip_port_list.append(ip_port)
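
# Assumed layout of ips.csv (a header row followed by ip,port columns), shown
# for illustration only; the addresses below are documentation examples, not
# real proxies:
#
#     ip,port
#     203.0.113.10,8000
#     203.0.113.11,8001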

# Pick one proxy at random and build the proxies dict used by requests
random_proxy = random.choice(ip_port_list)
proxies_dict = {
    "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": random_proxy},
    "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": random_proxy}
}

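# The dict above fixes a single proxy for the whole run. A minimal sketch of
# picking a fresh proxy per request instead; the helper name make_proxies is
# an assumption, not part of the original script:
def make_proxies():
    proxy = random.choice(ip_port_list)
    auth = "http://%s:%s@%s/" % (username, password, proxy)
    return {"http": auth, "https": auth}
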

# ---------------- Scrape the links for each city ----------------
ua = UserAgent()
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'User-Agent': ua.random
}

url = 'http://www.nmc.cn/f/rest/province'  # replace with the actual URL if needed

# Common helper
# Send a GET request and return the response object
def get_response(url):
    response = requests.get(url, headers=headers, proxies=proxies_dict)
    return response
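
# A minimal, hypothetical variant with a timeout and basic error handling;
# the 10-second timeout and the name get_response_safe are assumptions, not
# part of the original script:
def get_response_safe(url):
    try:
        return requests.get(url, headers=headers, proxies=proxies_dict, timeout=10)
    except requests.RequestException as exc:
        print(f"Request to {url} failed: {exc}")
        return None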


# Get the city code
# Find the code corresponding to the given name in the JSON data
def find_code_by_name(json_data, target_name):
    for item in json_data:
        if item['name'] == target_name:
            return item['code']
    return None

def get_cityCode(target_name, url):
    # Send a GET request and parse the JSON response
    response = get_response(url)
    if response.status_code == 200:
        json_data = response.json()
        code = find_code_by_name(json_data, target_name)
        if code:
            print(f"The code for {target_name} is {code}\n----------- city code fetched -----------\n")
            return code
        else:
            print(f"Code not found for {target_name}")
    else:
        print("Failed to retrieve JSON data")
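
# A hypothetical usage example; the province name '北京' is only an
# illustration, any name returned by the province endpoint would work:
#
#     city_code = get_cityCode('北京', url)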


# Get 'city_index', 'city_name' (and, below, 'city_url') for all districts and counties
def get_citys(url):
    # Send a GET request and parse the JSON response
    response = get_response(url)
    data = response.json()
    # Write the results to a CSV file
    with open('cities_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['city_index', 'city_name']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for city_data in data:
            writer.writerow({
                'city_index': city_data['code'],
                'city_name': city_data['city'],
            })

    print("\n----------- city_index / city_name fetched -----------\n")
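
# Assumed shape of the JSON consumed above, shown for illustration and not
# verified against the live API; each item is expected to carry at least a
# station 'code' and a 'city' name:
#
#     [{"code": "54511", "city": "北京"}, ...]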


# List holding the values of the city_index column
city_index_list = []

def get_cityIndex():
    # Open the CSV file for reading
    with open('cities_data.csv', 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        # Read each row
        for row in reader:
            # Extract the city_index value and append it to the list
            city_index_list.append(row['city_index'])

# Build the list of per-city data URLs (the cities_urls column)
def get_city_urls():
    # Construct the URL that serves weather data for each city
    citys_urllist = []
    get_cityIndex()
    for index in city_index_list:
        acityurl = 'http://www.nmc.cn/rest/weather?stationid=' + index
        acityurl = acityurl.replace(' ', '')
        citys_urllist.append(acityurl)
    df = pd.DataFrame(citys_urllist, columns=['city_url'])
    df.to_csv('cities_urls.csv', index=False)


get_city_urls()
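
# get_citys() is defined above but never called in this script, so cities_data.csv
# must already exist before get_city_urls() runs. A minimal, hypothetical
# end-to-end sequence; station_list_url is an assumed placeholder, and the real
# endpoint must return JSON items with 'code' and 'city' fields:
#
#     station_list_url = 'http://www.nmc.cn/f/rest/province/XXX'  # placeholder
#     get_citys(station_list_url)   # writes cities_data.csv
#     get_city_urls()               # writes cities_urls.csv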