#!/usr/bin/env python
# encoding: utf-8

"""
@version: v1.0
@author: xag
@license: Apache Licence
@contact: xinganguo@gmail.com
@site: http://www.xingag.top
@software: PyCharm
@file: spider_lagou.py
@time: 2018/10/10 10:17
@description: scrape job data from lagou.com with Selenium
"""

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from lxml import etree
import time
import re


# Wrap the crawler in a class
class LagouSpider(object):
    driver_path = "/usr/local/bin/chromedriver"

    def __init__(self):
        # Initialize the driver (Selenium 4 style: the executable path
        # is passed through a Service object)
        self.driver = webdriver.Chrome(service=Service(LagouSpider.driver_path))
        # Entry URL: the Python job-listing page
        self.base_url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        # Collected results
        self.positions = []

    def run(self):
        """
        Crawl the listing pages with Selenium.
        :return:
        """
        # 1. Open the entry URL
        self.driver.get(self.base_url)

        while True:
            # 2. Wait until the pager is rendered; its last span is the
            #    "next page" button
            WebDriverWait(driver=self.driver, timeout=20).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//div[@class='pager_container']/span[last()]"))
            )

            # 3. Grab and parse the page source
            #    (works the same way for page 1, page 2, page 3, ...)
            source = self.driver.page_source
            self.parse_list_page(source)

            # 4. Locate the last span in the pager: the "next page" button
            next_btn = self.driver.find_element(
                By.XPATH, "//div[@class='pager_container']/span[last()]")

            # 5. Click it, but only if this is not the last page
            if "pager_next_disabled" in next_btn.get_attribute('class'):
                # On the last page: quit the browser and stop
                self.driver.quit()
                break
            else:
                next_btn.click()

            # 6. Rest for one second after each listing page
            time.sleep(1)

    def parse_list_page(self, source):
        """
        Parse one listing page and extract the detail-page links.
        :param source: page source of the listing page
        :return:
        """
        # The detail links are <a> tags with class 'position_link'
        html = etree.HTML(source)
        links = html.xpath('//a[@class="position_link"]/@href')

        # Visit the detail page of every position
        for link in links:
            self.request_detail_page(link)
            # Rest for one second after each detail page
            time.sleep(1)

    def request_detail_page(self, detail_url):
        """
        Open a detail page.
        :param detail_url: URL of the detail page
        :return:
        """
        # Open the detail page in a NEW window rather than replacing the
        # listing page, so there are always exactly two windows:
        # window 0 = listing page, window 1 = detail page
        self.driver.execute_script("window.open('%s')" % detail_url)
        self.driver.switch_to.window(self.driver.window_handles[1])

        # Wait for the job title before reading page_source, so the
        # detail page has actually finished loading
        WebDriverWait(driver=self.driver, timeout=20).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']"))
        )

        # Parse the detail page
        detail_page_source = self.driver.page_source
        self.parse_detail_page(detail_page_source)

        # Close the detail window and switch the driver handle back to
        # the listing page
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, detail_page_source):
        """
        Parse a detail page with XPath.
        :param detail_page_source: page source of the detail page
        :return:
        """
        html_element = etree.HTML(detail_page_source)

        # [field] position name
        position_name = html_element.xpath("//div[@class='job-name']/span/text()")[0]

        job_request_spans = html_element.xpath("//dd[@class='job_request']//span")

        # [field] salary
        salary = job_request_spans[0].xpath('./text()')[0].strip()

        # [field] city
        # (a regex removes the '/' separators and any whitespace)
        city_pre = job_request_spans[1].xpath('./text()')[0].strip()
        city = re.sub(r'[\s/]', '', city_pre)

        # [field] years of experience
        work_years_pre = job_request_spans[2].xpath('./text()')[0].strip()
        work_years = re.sub(r'[\s/]', '', work_years_pre)

        # [field] education, again with '/' and whitespace removed
        education_pre = job_request_spans[3].xpath('./text()')[0].strip()
        education = re.sub(r'[\s/]', '', education_pre)

        # [field] full-time / part-time
        full_time = job_request_spans[4].xpath('./text()')[0].strip()

        # [field] job description: a list of text nodes, joined into one
        # string with leading/trailing whitespace stripped
        desc_pre = html_element.xpath('//dd[@class="job_bt"]//text()')
        desc = ''.join(desc_pre).strip()

        # [field] company name
        company_name = html_element.xpath('//h2[@class="fl"]/text()')[0].strip()

        position = {
            'position_name': position_name,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'full_time': full_time,
            'desc': desc,
            'company_name': company_name
        }

        print('==' * 30)
        print('Scraped one position successfully')
        print(position)
        print('==' * 30)

        self.positions.append(position)


if __name__ == '__main__':
    # 1. Create a spider instance
    spider = LagouSpider()
    # 2. Start crawling
    spider.run()
    # 3. Inspect the collected data
    print(spider.positions)
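
    # Optional, not part of the original script: a minimal persistence
    # sketch using only the standard library, so the scraped positions
    # survive the run. The output filename 'lagou_positions.json' is an
    # illustrative assumption; adjust it to whatever storage you use.
    import json
    with open('lagou_positions.json', 'w', encoding='utf-8') as f:
        json.dump(spider.positions, f, ensure_ascii=False, indent=2)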