import csv
import random

import pandas as pd
import requests
from fake_useragent import UserAgent

# Build the proxy IP pool
# List for storing "ip:port" strings
ip_port_list = []
username = 'd3347396121'
password = 'ufinmek6'

# Read the CSV file of proxy addresses
with open('ips.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # skip the first row (the header)
    for row in reader:
        # Join the IP address and port into an "ip:port" string and add it to the list
        ip_port = ':'.join(row)
        ip_port_list.append(ip_port)
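
# Assumed layout of ips.csv (a header row followed by ip,port columns), shown
# for illustration only; the addresses below are documentation examples, not
# real proxies:
#
#     ip,port
#     203.0.113.10,8000
#     203.0.113.11,8001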

# Pick one proxy at random and build the proxies dict used by requests
random_proxy = random.choice(ip_port_list)
proxies_dict = {
    "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": random_proxy},
    "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": random_proxy}
}

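# The dict above fixes a single proxy for the whole run. A minimal sketch of
# picking a fresh proxy per request instead; the helper name make_proxies is
# an assumption, not part of the original script:
def make_proxies():
    proxy = random.choice(ip_port_list)
    auth = "http://%s:%s@%s/" % (username, password, proxy)
    return {"http": auth, "https": auth}
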

# ---------------- Scrape the links for each city ----------------
ua = UserAgent()
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'User-Agent': ua.random
}

url = 'http://www.nmc.cn/f/rest/province'  # replace with the actual URL if needed

# Common helper
# Send a GET request and return the response object
def get_response(url):
    response = requests.get(url, headers=headers, proxies=proxies_dict)
    return response
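
# A minimal, hypothetical variant with a timeout and basic error handling;
# the 10-second timeout and the name get_response_safe are assumptions, not
# part of the original script:
def get_response_safe(url):
    try:
        return requests.get(url, headers=headers, proxies=proxies_dict, timeout=10)
    except requests.RequestException as exc:
        print(f"Request to {url} failed: {exc}")
        return None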


# Get the city code
# Find the code corresponding to the given name in the JSON data
def find_code_by_name(json_data, target_name):
    for item in json_data:
        if item['name'] == target_name:
            return item['code']
    return None

def get_cityCode(target_name, url):
    # Send a GET request and parse the JSON response
    response = get_response(url)
    if response.status_code == 200:
        json_data = response.json()
        code = find_code_by_name(json_data, target_name)
        if code:
            print(f"The code for {target_name} is {code}\n----------- city code fetched -----------\n")
            return code
        else:
            print(f"Code not found for {target_name}")
    else:
        print("Failed to retrieve JSON data")
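
# A hypothetical usage example; the province name '北京' is only an
# illustration, any name returned by the province endpoint would work:
#
#     city_code = get_cityCode('北京', url)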


# Get 'city_index', 'city_name' (and, below, 'city_url') for all districts and counties
def get_citys(url):
    # Send a GET request and parse the JSON response
    response = get_response(url)
    data = response.json()
    # Write the results to a CSV file
    with open('cities_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['city_index', 'city_name']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for city_data in data:
            writer.writerow({
                'city_index': city_data['code'],
                'city_name': city_data['city'],
            })

    print("\n----------- city_index / city_name fetched -----------\n")
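
# Assumed shape of the JSON consumed above, shown for illustration and not
# verified against the live API; each item is expected to carry at least a
# station 'code' and a 'city' name:
#
#     [{"code": "54511", "city": "北京"}, ...]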


# List holding the values of the city_index column
city_index_list = []

def get_cityIndex():
    # Open the CSV file for reading
    with open('cities_data.csv', 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        # Read each row
        for row in reader:
            # Extract the city_index value and append it to the list
            city_index_list.append(row['city_index'])

# Build the list of per-city data URLs (the cities_urls column)
def get_city_urls():
    # Construct the URL that serves weather data for each city
    citys_urllist = []
    get_cityIndex()
    for index in city_index_list:
        acityurl = 'http://www.nmc.cn/rest/weather?stationid=' + index
        acityurl = acityurl.replace(' ', '')
        citys_urllist.append(acityurl)
    df = pd.DataFrame(citys_urllist, columns=['city_url'])
    df.to_csv('cities_urls.csv', index=False)


get_city_urls()
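
# get_citys() is defined above but never called in this script, so cities_data.csv
# must already exist before get_city_urls() runs. A minimal, hypothetical
# end-to-end sequence; station_list_url is an assumed placeholder, and the real
# endpoint must return JSON items with 'code' and 'city' fields:
#
#     station_list_url = 'http://www.nmc.cn/f/rest/province/XXX'  # placeholder
#     get_citys(station_list_url)   # writes cities_data.csv
#     get_city_urls()               # writes cities_urls.csv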