#!/usr/bin/env python  
# encoding: utf-8  

""" 
@version: v1.0 
@author: xag 
@license: Apache Licence  
@contact: xinganguo@gmail.com 
@site: http://www.xingag.top 
@software: PyCharm 
@file: spider_tencent_recruit
@time: 2018/9/17 11:22 
@description：爬腾讯招聘职位信息
"""

import requests

from lxml import etree

import time

# 每页的职位数
PAGE_SIZE = 10

BASE_DOMAIN = 'https://hr.tencent.com/'

HEADERS = {
	'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
	'Referer': 'https://hr.tencent.com/position.php?lid=&tid=&keywords=python&start=10',
	'Cookie': '_ga=GA1.2.1222789966.1535530525; pgv_pvi=8193187840; pgv_si=s2985358336; PHPSESSID=22e3m8aknd19s1gqkh0i9eisk0; Hm_lvt_0bd5902d44e80b78cb1cd01ca0e85f4a=1536726429,1536908218,1537154694,1537166987; Hm_lpvt_0bd5902d44e80b78cb1cd01ca0e85f4a=1537167106'
}


def get_jo_detail_urls(page_url):
	"""
	1.根据当前页面url地址获取每一个职位的详情页面url
	:param page_url:当前页面的url
	:return:
	"""
	response = requests.get(page_url, headers=HEADERS)

	html_element = etree.HTML(response.text)

	# print(etree.tostring(html_element, encoding='utf-8').decode('utf-8'))

	detail_urls = html_element.xpath('//tr[@class="even" or @class="odd"]//a/@href')

	# 获取所有职位详情页面的url
	detail_urls = map(lambda detail_url: BASE_DOMAIN + detail_url, detail_urls)

	return detail_urls


def get_detail_msg(detail_url):
	"""
	2.获取某个职位的详细数据
	:param detail_url: 职位详细页面的url
	:return: 职位数据
	"""
	# print('请求的详细地址是:' + detail_url)
	response = requests.get(detail_url, headers=HEADERS)
	html_element = etree.HTML(response.text)

	position = {}

	# 【数据】获取职位标题
	title = html_element.xpath('//tr[@class="h"]/td/text()')[0]
	position['title'] = title

	# 【数据】工作地点/职位类别
	top_infos = html_element.xpath('//tr[@class="c bottomline"]//text()')
	position['location'] = top_infos[top_infos.index('工作地点：') + 1]
	position['category'] = top_infos[top_infos.index('职位类别：') + 1]

	content_infos = html_element.xpath('//ul[@class="squareli"]')
	# 【数据】工作职责
	work_do_info = content_infos[0]
	position['duty'] = work_do_info.xpath("./li/text()")

	# 【数据】工作要求
	work_ask_info = content_infos[1]
	position['ask'] = work_ask_info.xpath('./li/text()')

	return position


def spider():
	# 0.待返回的职位数据
	positions = []

	# 1.获取前10页的职位数据
	for page_num in range(0, 10):
		print('开始爬取第{}页数据'.format(page_num + 1))

		# 2.每一页的地址
		url = 'https://hr.tencent.com/position.php?keywords=python&lid=0&tid=0&start={}#a'.format(page_num * PAGE_SIZE)

		# 3.获取【当前页】所有职位的【详情页面的url】
		detail_urls = get_jo_detail_urls(url)

		# 4.一个个去解析详情页面的数据
		for detail_url in detail_urls:
			position = get_detail_msg(detail_url)
			positions.append(position)

		time.sleep(1)

	print('爬取完成！')
	print(positions)


if __name__ == '__main__':
	spider()