Python 爬虫结果出现五次重复 - 知否问答 - 万象云+社区

from bs4 import BeautifulSoup
import requests
import time

# Step 1: download the whole page (requests).
# Step 2: filter the downloaded markup, keeping only the data we need (bs4).

# Request headers sent with every page fetch.
REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
    'Referer': 'http://www.weather.com.cn/textFC/hb.shtml',
    'Host': 'www.weather.com.cn',
}

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 10


def _cell_text(cell):
    """Return the text of a table cell with newline characters removed."""
    return cell.text.replace('\n', '')


def _iter_forecast_rows(html):
    """Yield one (province, city, weather, wind, tmin) tuple per table row.

    The search for ``div.conMidtab2`` tables is limited to the FIRST
    ``div.conMidtab`` container.  Searching the whole document (as the
    script originally did) returns the tables of every ``conMidtab``
    container on the page, so each row came out several times.
    NOTE(review): presumably each container holds one forecast day and only
    the first is displayed -- confirm against the live page.

    If the page has no ``conMidtab`` container at all, the whole document
    is searched, which matches the original behaviour.
    """
    soup = BeautifulSoup(html, 'lxml')

    scope = soup.find('div', class_='conMidtab')
    if scope is None:
        scope = soup

    for table_div in scope.find_all('div', class_='conMidtab2'):
        # The first two <tr> elements are skipped; data rows start at the
        # third one.
        rows = table_div.find_all('tr')[2:]
        province = '1'  # placeholder until the first data row is read
        for index, row in enumerate(rows):
            cells = row.find_all('td')
            # Only the first data row of a table carries the province cell,
            # which shifts every other column one position to the right.
            offset = 1 if index == 0 else 0
            if len(cells) < offset + 7:
                # Row does not have the expected columns; skip it instead
                # of failing with IndexError.
                continue
            if index == 0:
                province = _cell_text(cells[0])
            city = _cell_text(cells[offset])
            weather = _cell_text(cells[offset + 4])
            wind = _cell_text(cells[offset + 5])
            tmin = _cell_text(cells[offset + 6])
            yield province, city, weather, wind, tmin


def get_temperature(url):
    """Fetch one regional forecast page and print a line per city.

    Each printed line has the form ``province city weather wind tmin``.

    Args:
        url: address of the page to download.
    """
    response = requests.get(url, headers=REQUEST_HEADERS,
                            timeout=REQUEST_TIMEOUT)
    # response.content is raw bytes; printed directly, Chinese text shows
    # up as escapes such as \xe7\x99\xbe, so decode it to str first.
    html = response.content.decode('utf-8')
    for fields in _iter_forecast_rows(html):
        print('%s %s %s %s %s' % fields)


def main():
    """Print the forecast table of every regional page, pausing between requests."""
    urls = [
        'http://www.weather.com.cn/textFC/hb.shtml',
        'http://www.weather.com.cn/textFC/db.shtml',
        'http://www.weather.com.cn/textFC/hd.shtml',
        'http://www.weather.com.cn/textFC/hz.shtml',
        'http://www.weather.com.cn/textFC/hn.shtml',
        'http://www.weather.com.cn/textFC/xb.shtml',
        'http://www.weather.com.cn/textFC/xn.shtml',
    ]
    for url in urls:
        get_temperature(url)
        time.sleep(2)  # be polite to the server between page fetches


if __name__ == '__main__':
    main()

回答动态