拉勾网总是报错 - 知否问答 - 万象云+社区

def __init__(self):
    """Create the random User-Agent provider and the project HTTP client."""
    self.ua = UserAgent()
    self.http = yt_common.factory.Factory.get_instance("project")


def _request_with_retry(self, send, url, pause_range, **kwargs):
    """Send one request through a fresh proxy, retrying once on failure.

    Args:
        send: Callable performing the request (``requests.get`` or
            ``requests.post``).
        url: Target URL.
        pause_range: ``(low, high)`` bounds, in seconds, of the random pause
            taken after the request to throttle the crawl.
        **kwargs: Extra keyword arguments forwarded to ``send``.

    Returns:
        The response object returned by ``send``.

    Raises:
        Exception: Whatever the second attempt raises; only the first
            failure is swallowed (and printed).
    """
    try:
        response = send(url, proxies=get_ip(), **kwargs)
    except Exception as e:  # best-effort: any failure gets exactly one retry
        print(e)
        response = send(url, proxies=get_ip(), **kwargs)
    # Pause after every request (including a retried one) so the throttle is
    # never skipped.
    time.sleep(random.randint(*pause_range))
    return response


def _build_team_info(self, names, titles, intros):
    """Combine the parallel manager lists into ``{"0": {...}, "1": {...}}``.

    ``titles`` and ``intros`` may be shorter than ``names`` (the xpath
    queries are independent), in which case the missing field is ``""``
    instead of raising ``IndexError``.
    """
    team = {}
    for idx, member_name in enumerate(names):
        team[str(idx)] = {
            "name": member_name,
            "title": titles[idx] if idx < len(titles) else "",
            # Strip every whitespace character out of the introduction text.
            "instro": "".join(intros[idx].split()) if idx < len(intros) else "",
        }
    return team


def get_content(self):
    """Crawl the Lagou company listings and push each company to the backend.

    For each of the three listing endpoints, pages 20 down to 1 are requested.
    For every company in a page the company detail page is downloaded, the
    website link, addresses and management-team entries are extracted with
    XPath, and the combined record is posted as JSON to
    ``/spider/source/save-lagou`` through ``self.http``.

    Pages whose response is not valid JSON (or carries no ``result`` list)
    are reported and skipped instead of aborting the whole crawl.
    """
    # NOTE(security): these are hard-coded session cookies of a logged-in
    # account; they should be moved to configuration and rotated.
    # The original literal contained `fromsite=""`, which Python parsed as
    # two adjacent string literals, so the runtime value is `fromsite=`.
    cookies_str = (
        "user_trace_token=20180909010719-4eb82332-59f2-4979-b7ba-4a96de35eb40; "
        "_ga=GA1.2.1153938840.1536426437; "
        "LGUID=20180909010720-a5755fe0-b389-11e8-8ccd-525400f775ce; "
        "_qddaz=QD.wx1cg9.ftx1wj.jnl51m1t; "
        "JSESSIONID=ABAAABAAADEAAFIE6475DE07CCCE2D0833999916DC6AED6; "
        "utm_source=m_cf_seo_ald_wap; "
        "fromsite=; "
        "TG-TRACK-CODE=jobs_similar; "
        "sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216628753458944-0ec2370b38b4a8-163b6953-1296000-16628753459972%22%2C%22%24device_id%22%3A%2216628753458944-0ec2370b38b4a8-163b6953-1296000-16628753459972%22%2C%22props%22%3A%7B%22%24latest_utm_source%22%3A%22m_cf_cpt_baidu_pc%22%2C%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24latest_referrer_host%22%3A%22www.baidu.com%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%7D; "
        "showExpriedIndex=1; "
        "showExpriedCompanyHome=1; "
        "showExpriedMyPublish=1; "
        "hasDeliver=0; "
        "index_location_city=%E5%8C%97%E4%BA%AC; "
        "WEBTJ-ID=12252018%2C161612-167e46f847741d-04ce0d97f54b0f-163b6953-1296000-167e46f8478f5f; "
        "_gid=GA1.2.1277196703.1545725773; "
        "X_HTTP_TOKEN=3dec5bde9264a1350e562709684512ea; "
        "LG_LOGIN_USER_ID=aa0676d165159370bc5d629d9b5a41215c2b10b329a917bb; "
        "_putrc=73B45C3A2AAE9C2E; "
        "login=true; "
        "unick=%E6%8B%89%E5%8B%BE%E7%94%A8%E6%88%B76572; "
        "_gat=1; "
        "Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1545816032,1545873968,1545878834,1545904413; "
        "LGSID=20181227175333-45c3f23b-09bd-11e9-b129-525400f775ce; "
        "PRE_UTM=m_cf_cpt_baidu_pc; "
        "PRE_HOST=www.baidu.com; "
        "PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fs%3Fwd%3D%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591%26rsv_spt%3D1%26rsv_iqid%3D0xfb7d4ab90001af54%26issp%3D1%26f%3D8%26rsv_bp%3D1%26rsv_idx%3D2%26ie%3Dutf-8%26rqlang%3Dcn%26tn%3Dbaiduhome_pg%26rsv_enter%3D1%26oq%3Dnohup%252520%2525E5%2525A4%252584%2525E7%252590%252586%26rsv_t%3D2a73cVwL843%252Ba5Ai2lBIHgKBBA9Hf58WCmSNIrhGhaXjOjWtQO46%252Fa1hW5BKfpVlE%252BnB%26inputT%3D4637%26rsv_pq%3Dbb8ccaa20001742c%26rsv_sug3%3D70%26rsv_sug1%3D54%26rsv_sug7%3D100%26bs%3Dnohup%2520%25E5%25A4%2584%25E7%2590%2586; "
        "PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_baidu_pc; "
        "gate_login_token=fa8999aa6d617649ff083782230eac8ba8c9cc1520ae502f; "
        "Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1545904416; "
        "LGRID=20181227175336-475b23dd-09bd-11e9-ad84-5254005c3644"
    )
    accept = ("text/html,application/xhtml+xml,application/xml;q=0.9,"
              "image/webp,image/apng,*/*;q=0.8")

    # Headers for the JSON listing endpoint.
    # NOTE(review): advertising "br" presumably requires a brotli decoder to
    # be installed for the response body to be readable — confirm.
    list_headers = {
        "User-Agent": self.ua.random,
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
        "Cache-Control": "no-cache",
        "Pragma": "no-cache",
        "Cookie": cookies_str,
        "X-Anit-Forge-Code": "0",
        "Connection": "keep-alive",
        "X-Anit-Forge-Token": "None",
        "Accept": accept,
        "Host": "www.lagou.com",
    }
    # Headers for the HTML company detail page. The original literal listed
    # "Cache-Control" and "Accept-Language" twice; the effective (last)
    # values are kept here.
    detail_headers = {
        "Upgrade-Insecure-Requests": "1",
        "Host": "www.lagou.com",
        "User-Agent": self.ua.random,
        "Cookie": cookies_str,
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Cache-Control": "no-cache",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": accept,
        "Pragma": "no-cache",
    }

    url_list = ['https://www.lagou.com/gongsi/0-2-0-0.json',
                'https://www.lagou.com/gongsi/0-1-0-0.json',
                'https://www.lagou.com/gongsi/0-3-0-0.json']

    for url in url_list:
        # The referer is the listing URL without its ".json" suffix.
        referer = url.partition('.json')[0]
        list_headers["Referer"] = referer
        detail_headers["Referer"] = referer

        for page_no in range(20, 0, -1):
            print('进入的页面为%d' % page_no)
            print('网址为%s' % url)
            # "pn" carries the page number; previously it was an empty dict
            # (the .format() call sat on the wrong literal), so the page
            # number was never sent.
            form_data = {"first": "false", "pn": "{}".format(page_no),
                         "sortField": "0", "havemark": "0"}
            response = self._request_with_retry(
                requests.post, url, (10, 20),
                headers=list_headers, data=form_data)
            print(response.text)

            try:
                json_data = json.loads(response.text)
            except ValueError as e:
                # Not JSON (e.g. a block/verification page): skip this page.
                print(e)
                continue
            companies = json_data.get('result') if isinstance(json_data, dict) else None
            if not companies:
                continue

            for company in companies:
                company_id = company['companyId']
                logo_link = ('https://www.lgstatic.com/thumbnail_300x300/'
                             + str(company['companyLogo']))
                company_link = ("https://www.lagou.com/gongsi/"
                                + str(company_id) + ".html")
                industry_field = company['industryField'].replace('、', ",")

                res = self._request_with_retry(
                    requests.get, company_link, (10, 30),
                    headers=detail_headers)
                print(company_link)
                print(res.url)

                # Parse the detail page once, always decoded as UTF-8.
                page = etree.HTML(res.content.decode('utf-8'))
                site_links = page.xpath('//div[@class="company_main"]/h1/a/@href')
                if site_links:
                    company_link = site_links[0]
                # else: no website link on the page; keep the Lagou page URL.
                print("官网网址%s" % company_link)

                addresses = [ad.strip() for ad in
                             page.xpath('//p[@class="mlist_li_desc"]/text()')]
                name = page.xpath('//p[@class="item_manager_name"]/span/text()')
                instro = page.xpath(
                    '//div[@class="item_manager_content"]/p/text()'
                    '|//div[@class="item_manager_content"]/text()')
                title = page.xpath('//p[@class="item_manager_title"]/text()')
                print(name)
                print(instro)
                print(title)
                team_info = self._build_team_info(name, title, instro)

                # ensure_ascii=False keeps non-ASCII text readable while
                # still producing valid JSON (decoding with 'unicode_escape'
                # also unescaped quotes and backslashes).
                data = json.dumps({
                    "companyId": company_id,
                    "companyFullName": company['companyFullName'],
                    "companyShortName": company['companyShortName'],
                    "companyLogoLink": logo_link,
                    "companyFeature": company['companyFeatures'],
                    "company_link": company_link,
                    "companyCity": company['city'],
                    "companySize": company['companySize'],
                    "financeStage": company['financeStage'],
                    "industryField": industry_field,
                    "companyAddress": addresses,
                    "companyTeam": team_info,
                }, ensure_ascii=False)
                lagou_response = self.http.set_post().http_send(
                    "/spider/source/save-lagou", {"data": data})
                print(lagou_response)
                time.sleep(random.randint(3, 5))
                print('\n')

回答动态