摘要:本篇教程探讨了大数据采集之爬取腾讯招聘Python岗位,希望大家阅读本篇文章以后有所收获,并对相关内容有更深入的理解。
本篇教程探讨了大数据采集之爬取腾讯招聘Python岗位,希望大家阅读本篇文章以后有所收获,并对相关内容有更深入的理解。
"""Crawl Tencent recruitment listings for a keyword and save them as JSON.

The script walks the paginated search results on hr.tencent.com, opens every
position's detail page, extracts title / location / duties / requirements and
writes the collected records to a JSON file.
"""
import time
from urllib.parse import quote

import requests
import simplejson
from lxml import etree

# Browser-like headers sent with every request.
HEADER = {
    "Cookie": "PHPSESSID=8ir8s188dp7k6r5sjjoe32p946; pgv_pvi=877227008; pgv_si=s8137142272",
    "Referer": "https://hr.tencent.com/social.php",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/69.0.3497.100 Chrome/69.0.3497.100 Safari/537.36",
    # Fixed: the header name was misspelled as "pgrade-Insecure-Requests".
    "Upgrade-Insecure-Requests": "1",
}

BASE_URL = "https://hr.tencent.com/"
LIST_URL_TEMPLATE = "https://hr.tencent.com/position.php?lid=&tid=&keywords={}&start={}#a"
PAGE_SIZE = 10        # the `start` query parameter advances by 10 per page
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled server hangs the crawl
PAGE_DELAY = 5        # seconds to wait before fetching another listing page
DETAIL_DELAY = 3      # seconds to wait before fetching a detail page


def _fetch_html(url):
    """Download *url* with the shared headers and return the parsed lxml tree.

    Raises:
        requests.RequestException: on network errors, timeouts or a non-2xx
            HTTP status.
    """
    response = requests.get(url, headers=HEADER, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return etree.HTML(response.text)


def _listing_url(keyword, start):
    """Build the search-result URL for *keyword* beginning at offset *start*."""
    return LIST_URL_TEMPLATE.format(quote(keyword), start)


def get_detail_url(url):
    """Return the detail-page links on a listing page and the page count.

    Args:
        url: URL of one search-result (listing) page.

    Returns:
        A tuple ``(links, page_numbers)``: ``links`` is a list of absolute
        detail-page URLs; ``page_numbers`` is the text of the second-to-last
        pagination link, or ``"1"`` when the page has fewer than two
        pagination links.
    """
    html = _fetch_html(url)

    nav_labels = html.xpath("//div[@class='pagenav']//a/text()")
    # Presumably the last pagination link is "next page", making the one
    # before it the highest page number -- verify against the live markup.
    page_numbers = nav_labels[-2] if len(nav_labels) >= 2 else "1"

    # The original matched only rows with class 'even'; since paging steps by
    # 10 the remaining rows are assumed to carry class 'odd'.
    # NOTE(review): confirm the 'odd' class name against the live markup.
    detail_url = html.xpath("//tr[@class='even' or @class='odd']//a/@href")
    links = [BASE_URL + href for href in detail_url]
    return links, page_numbers


def parse_detail(url):
    """Extract one position's fields from its detail page.

    Args:
        url: absolute URL of a position detail page.

    Returns:
        A dict with keys ``title``, ``location``, ``duty`` and ``require``.
        ``duty`` and ``require`` are lists of text fragments (empty when the
        corresponding list is missing from the page).

    Raises:
        IndexError: if the title or location cell is not present.
    """
    html = _fetch_html(url)

    title = html.xpath("//td[@id='sharetitle']/text()")[0]
    tds = html.xpath("//tr[@class='c bottomline']/td")
    # The last text node of the first cell is used as the location value.
    location = tds[0].xpath(".//text()")[-1]

    # The first 'squareli' list is read as duties, the second as requirements.
    work_info = html.xpath("//ul[@class='squareli']")
    duty = work_info[0].xpath(".//text()") if len(work_info) > 0 else []
    require = work_info[1].xpath(".//text()") if len(work_info) > 1 else []

    return {
        'title': title,
        'location': location,
        'duty': duty,
        'require': require,
    }


def spider(keyword="python", output_path='tencent.json'):
    """Crawl every listing page for *keyword* and dump the positions to JSON.

    Args:
        keyword: search keyword placed in the listing URL.
        output_path: file that receives the JSON array of positions.

    Returns:
        The list of position dicts that was collected.
    """
    positions = []
    # The seed request yields both the total page count and page one's links,
    # so page one does not need to be downloaded a second time in the loop.
    urls, page_number = get_detail_url(_listing_url(keyword, 0))
    try:
        for page_index in range(int(page_number)):
            if page_index > 0:
                time.sleep(PAGE_DELAY)
                urls, _ = get_detail_url(
                    _listing_url(keyword, page_index * PAGE_SIZE)
                )
            for url in urls:
                time.sleep(DETAIL_DELAY)
                position = parse_detail(url)
                print("+++++++++++" * 50, position)
                positions.append(position)
    finally:
        # Write whatever was collected even if the crawl aborts midway.
        # ensure_ascii=False keeps non-ASCII text readable in the UTF-8 file.
        with open(output_path, 'w', encoding='utf-8') as f:
            simplejson.dump(positions, f, ensure_ascii=False)
    return positions


if __name__ == '__main__':
    spider()
本文由职坐标整理发布,学习更多的相关知识,请关注职坐标IT知识库!
您输入的评论内容中包含违禁敏感词
我知道了
请输入正确的手机号码
请输入正确的验证码
您今天的短信下发次数太多了,明天再试试吧!
我们会在第一时间安排职业规划师联系您!
您也可以联系我们的职业规划师咨询:
版权所有 职坐标-一站式IT培训就业服务领导者 沪ICP备13042190号-4
上海海同信息科技有限公司 Copyright ©2015 www.zhizuobiao.com,All Rights Reserved.
沪公网安备 31011502005948号