账号密码登录
微信安全登录
微信扫描二维码登录

登录后绑定QQ、微信即可实现信息互通

手机验证码登录
找回密码返回
邮箱找回 手机找回
注册账号返回
其他登录方式
分享
  • 收藏
    X
    拉勾网总是报错
    20
    0

    import requests
    from config.ip_pool import get_ip
    from middlewares import *
    import json, random, time
    from lxml import etree
    from fake_useragent import UserAgent
    import yt_common.factory
    import re

    class lagou():
        """Crawl Lagou company listings and push each company to the project API.

        For every listing URL in ``LIST_URLS`` the crawler walks the pages from
        ``LAST_PAGE`` down to ``FIRST_PAGE``, fetches each company's profile
        page, extracts address and management-team details, and posts the
        combined record to ``SAVE_ENDPOINT`` through the project HTTP client.

        Non-200 answers (for example ``502 Bad Gateway`` from the gateway) and
        network errors are retried a bounded number of times with a fresh
        proxy; when retries are exhausted the page or company is skipped
        instead of aborting the whole crawl.
        """

        LIST_URLS = (
            'https://www.lagou.com/gongsi/0-2-0-0.json',
            'https://www.lagou.com/gongsi/0-1-0-0.json',
            'https://www.lagou.com/gongsi/0-3-0-0.json',
        )
        FIRST_PAGE = 1
        LAST_PAGE = 20
        MAX_ATTEMPTS = 3        # tries per request before giving up
        REQUEST_TIMEOUT = 30    # seconds; without it a dead proxy hangs forever
        LOGO_PREFIX = 'https://www.lgstatic.com/thumbnail_300x300/'
        COMPANY_URL = "https://www.lagou.com/gongsi/{}.html"
        SAVE_ENDPOINT = "/spider/source/save-lagou"

        # Browser session cookie, one entry per line. It is joined with "; "
        # so the value contains no line break; single quotes keep the literal
        # `fromsite=""` intact.
        # NOTE(review): this is a captured login session and will expire;
        # pass a fresh value through ``lagou(cookies=...)`` rather than
        # editing it here.
        DEFAULT_COOKIE = "; ".join((
            'user_trace_token=20180909010719-4eb82332-59f2-4979-b7ba-4a96de35eb40',
            '_ga=GA1.2.1153938840.1536426437',
            'LGUID=20180909010720-a5755fe0-b389-11e8-8ccd-525400f775ce',
            '_qddaz=QD.wx1cg9.ftx1wj.jnl51m1t',
            'JSESSIONID=ABAAABAAADEAAFIE6475DE07CCCE2D0833999916DC6AED6',
            'utm_source=m_cf_seo_ald_wap',
            'fromsite=""',
            'TG-TRACK-CODE=jobs_similar',
            'sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216628753458944-0ec2370b38b4a8-163b6953-1296000-16628753459972%22%2C%22%24device_id%22%3A%2216628753458944-0ec2370b38b4a8-163b6953-1296000-16628753459972%22%2C%22props%22%3A%7B%22%24latest_utm_source%22%3A%22m_cf_cpt_baidu_pc%22%2C%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24latest_referrer_host%22%3A%22www.baidu.com%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%7D',
            'showExpriedIndex=1',
            'showExpriedCompanyHome=1',
            'showExpriedMyPublish=1',
            'hasDeliver=0',
            'index_location_city=%E5%8C%97%E4%BA%AC',
            'WEBTJ-ID=12252018%2C161612-167e46f847741d-04ce0d97f54b0f-163b6953-1296000-167e46f8478f5f',
            '_gid=GA1.2.1277196703.1545725773',
            'X_HTTP_TOKEN=3dec5bde9264a1350e562709684512ea',
            'LG_LOGIN_USER_ID=aa0676d165159370bc5d629d9b5a41215c2b10b329a917bb',
            '_putrc=73B45C3A2AAE9C2E',
            'login=true',
            'unick=%E6%8B%89%E5%8B%BE%E7%94%A8%E6%88%B76572',
            '_gat=1',
            'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1545816032,1545873968,1545878834,1545904413',
            'LGSID=20181227175333-45c3f23b-09bd-11e9-b129-525400f775ce',
            'PRE_UTM=m_cf_cpt_baidu_pc',
            'PRE_HOST=www.baidu.com',
            'PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fs%3Fwd%3D%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591%26rsv_spt%3D1%26rsv_iqid%3D0xfb7d4ab90001af54%26issp%3D1%26f%3D8%26rsv_bp%3D1%26rsv_idx%3D2%26ie%3Dutf-8%26rqlang%3Dcn%26tn%3Dbaiduhome_pg%26rsv_enter%3D1%26oq%3Dnohup%252520%2525E5%2525A4%252584%2525E7%252590%252586%26rsv_t%3D2a73cVwL843%252Ba5Ai2lBIHgKBBA9Hf58WCmSNIrhGhaXjOjWtQO46%252Fa1hW5BKfpVlE%252BnB%26inputT%3D4637%26rsv_pq%3Dbb8ccaa20001742c%26rsv_sug3%3D70%26rsv_sug1%3D54%26rsv_sug7%3D100%26bs%3Dnohup%2520%25E5%25A4%2584%25E7%2590%2586',
            'PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_baidu_pc',
            'gate_login_token=fa8999aa6d617649ff083782230eac8ba8c9cc1520ae502f',
            'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1545904416',
            'LGRID=20181227175336-475b23dd-09bd-11e9-ad84-5254005c3644',
        ))

        def __init__(self, cookies=None):
            """Create the crawler.

            Args:
                cookies: Optional ``Cookie`` header value. When omitted the
                    session captured in ``DEFAULT_COOKIE`` is used.
            """
            self.ua = UserAgent()
            self.http = yt_common.factory.Factory.get_instance("project")
            self.cookies = self.DEFAULT_COOKIE if cookies is None else cookies

        # ------------------------------------------------------------------
        # Pure helpers (no network access)
        # ------------------------------------------------------------------
        @staticmethod
        def _build_form_data(page):
            """Return the POST form for listing page number *page*."""
            # The page number must be sent as "pn"; the original sent an empty
            # dict there and formatted the index into a string with no
            # placeholder, so the requested page never changed.
            return {"first": "false", "pn": str(page), "sortField": "0", "havemark": "0"}

        @staticmethod
        def _referer_for(url):
            """Return the listing URL without its trailing ``.json`` suffix."""
            return re.sub(r'\.json$', '', url)

        @staticmethod
        def _build_team_info(names, titles, intros):
            """Combine the parallel manager lists into a dict keyed by position.

            ``titles`` and ``intros`` may be shorter than ``names``; missing
            entries become empty strings instead of raising ``IndexError``.
            All whitespace is removed from each intro.
            """
            team = {}
            for idx, manager_name in enumerate(names):
                title = titles[idx] if idx < len(titles) else ""
                intro = ''.join(intros[idx].split()) if idx < len(intros) else ""
                team[str(idx)] = {"name": manager_name, "title": title, "instro": intro}
            return team

        @staticmethod
        def _encode_payload(payload):
            """Serialize *payload* as JSON while keeping non-ASCII text readable.

            ``ensure_ascii=False`` replaces the former
            ``.encode('utf-8').decode('unicode_escape')`` round trip, which also
            un-escaped quotes and backslashes and so produced invalid JSON.
            """
            return json.dumps(payload, ensure_ascii=False)

        # ------------------------------------------------------------------
        # Header construction
        # ------------------------------------------------------------------
        def _list_headers(self):
            """Headers for the JSON listing endpoint."""
            # NOTE(review): "br" is advertised here; confirm the runtime can
            # decode Brotli responses, otherwise drop it from Accept-Encoding.
            return {
                "User-Agent": self.ua.random,
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "en-US,en;q=0.9",
                "Cache-Control": "no-cache",
                "Pragma": "no-cache",
                "Cookie": self.cookies,
                "X-Anit-Forge-Code": "0",
                "Connection": "keep-alive",
                "X-Anit-Forge-Token": "None",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Host": "www.lagou.com",
            }

        def _page_headers(self):
            """Headers for the HTML company profile pages."""
            # The original literal repeated "Cache-Control" and
            # "Accept-Language"; the last value won, which is what is kept.
            return {
                "Upgrade-Insecure-Requests": "1",
                "Host": "www.lagou.com",
                "User-Agent": self.ua.random,
                "Cookie": self.cookies,
                "Accept-Encoding": "gzip, deflate, br",
                "Connection": "keep-alive",
                "Accept-Language": "en-US,en;q=0.9",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Cache-Control": "no-cache",
                "Pragma": "no-cache",
            }

        # ------------------------------------------------------------------
        # Network helpers
        # ------------------------------------------------------------------
        def _fetch(self, method, url, **kwargs):
            """Send a request through a fresh proxy, retrying on failure.

            Returns the response when the status code is 200, otherwise
            ``None`` after ``MAX_ATTEMPTS`` unsuccessful tries.
            """
            for attempt in range(1, self.MAX_ATTEMPTS + 1):
                try:
                    response = requests.request(
                        method, url, proxies=get_ip(),
                        timeout=self.REQUEST_TIMEOUT, **kwargs)
                except Exception as e:
                    # Broad on purpose: get_ip() is a project helper whose
                    # failure modes are not visible here, and any failure of
                    # one attempt should only trigger the next attempt.
                    print(e)
                else:
                    if response.status_code == 200:
                        return response
                    print('request to %s failed with HTTP %d (attempt %d/%d)'
                          % (url, response.status_code, attempt, self.MAX_ATTEMPTS))
                if attempt < self.MAX_ATTEMPTS:
                    # Short pause before retrying with another proxy.
                    time.sleep(random.randint(3, 5))
            return None

        def _fetch_company_list(self, url, page, headers):
            """Return the list of company dicts on one listing page ([] on failure)."""
            response = self._fetch("POST", url, headers=headers,
                                   data=self._build_form_data(page))
            if response is None:
                print('skipping page %d of %s' % (page, url))
                return []
            time.sleep(random.randint(10, 20))
            print(response.text)

            try:
                json_data = json.loads(response.text)
            except ValueError as e:
                # An anti-crawler or gateway page can come back with status
                # 200 but an HTML body; do not let it abort the crawl.
                print('page %d of %s did not return JSON: %s' % (page, url, e))
                return []
            if not isinstance(json_data, dict):
                return []
            return json_data.get('result') or []

        def _process_company(self, company, headers):
            """Fetch one company's profile page and post the merged record."""
            company_id = company['companyId']
            profile_url = self.COMPANY_URL.format(company_id)

            res = self._fetch("GET", profile_url, headers=headers)
            if res is None:
                print('skipping company %s' % company_id)
                return
            time.sleep(random.randint(10, 30))
            print(profile_url)
            print(res.url)

            # Parse once and reuse the tree for every XPath query.
            # NOTE(review): assumes the profile pages are UTF-8, as the
            # original title extraction did — confirm.
            tree = etree.HTML(res.content.decode('utf-8', errors='replace'))
            if tree is None:
                print('empty page for company %s' % company_id)
                return

            website = tree.xpath('//div[@class="company_main"]/h1/a/@href')
            # Fall back to the Lagou profile URL when no official site is listed.
            company_link = website[0] if website else profile_url
            print("官网网址%s" % company_link)

            company_address = [
                ad.strip() for ad in tree.xpath('//p[@class="mlist_li_desc"]/text()')]
            names = tree.xpath('//p[@class="item_manager_name"]/span/text()')
            intros = tree.xpath(
                '//div[@class="item_manager_content"]/p/text()'
                '|//div[@class="item_manager_content"]/text()')
            titles = tree.xpath('//p[@class="item_manager_title"]/text()')
            print(names)
            print(intros)
            print(titles)

            data = self._encode_payload({
                "companyId": company_id,
                "companyFullName": company.get('companyFullName'),
                "companyShortName": company.get('companyShortName'),
                "companyLogoLink": self.LOGO_PREFIX + str(company.get('companyLogo')),
                "companyFeature": company.get('companyFeatures'),
                "company_link": company_link,
                "companyCity": company.get('city'),
                "companySize": company.get('companySize'),
                "financeStage": company.get('financeStage'),
                "industryField": (company.get('industryField') or '').replace('、', ","),
                "companyAddress": company_address,
                "companyTeam": self._build_team_info(names, titles, intros),
            })

            lagou_response = self.http.set_post().http_send(self.SAVE_ENDPOINT, {"data": data})
            print(lagou_response)
            time.sleep(random.randint(3, 5))
            print('\n')

        # ------------------------------------------------------------------
        # Entry point
        # ------------------------------------------------------------------
        def get_content(self):
            """Crawl every listing URL and page, saving each company found.

            Returns nothing; results are posted to the project API and progress
            is printed to stdout.
            """
            list_headers = self._list_headers()
            page_headers = self._page_headers()

            for url in self.LIST_URLS:
                referer = self._referer_for(url)
                list_headers["Referer"] = referer
                page_headers["Referer"] = referer

                for page in range(self.LAST_PAGE, self.FIRST_PAGE - 1, -1):
                    print('进入的页面为%d' % page)
                    print('网址为%s' % url)
                    for company in self._fetch_company_list(url, page, list_headers):
                        self._process_company(company, page_headers)
    
    

    if __name__ == '__main__':
        # The guard must test the module dunder `__name__`; a bare `name` is
        # undefined at module level and raises NameError. The body stays a
        # no-op as in the original; to start a crawl, replace `pass` with
        # `lagou().get_content()`.
        pass
    
    

    这是我爬取拉勾网的代码。我在爬取页面的时候总是会遇到一些反爬措施,比如返回 502 网关错误(502 Bad Gateway)之类的。

    0
    打赏
    收藏
    点击回答
        全部回答
    • 0
    • 战国策 普通会员 1楼
      502 Bad Gateway

      502 Bad Gateway


      nginx
    更多回答
    扫一扫访问手机版
    • 回到顶部
    • 回到顶部