python3利用selenium自动获取阿里社会招聘信息到表格(反爬selenium代码)

发布时间:2024-11-27 07:35

就业信息获取:关注招聘会和职业讲座 #生活知识# #社会生活# #就业指导#

最新推荐文章于 2023-11-16 19:44:24 发布

执笔写回憶 于 2019-03-04 18:53:35 发布

版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。

导入的debug_info包:https://blog.csdn.net/z564359805/article/details/85624881

import time

from openpyxl import Workbook
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By

from debug_info import Log_info

class Saas(object):
    """Scrape Alibaba's public job-listing pages with Selenium and save
    the collected rows to an .xlsx workbook via openpyxl."""

    def __init__(self):
        # Strip the "enable-automation" switch so Chrome does not show the
        # "controlled by automated test software" banner -- a simple
        # anti-detection measure against the site's Selenium checks.
        option = ChromeOptions()
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        self.driver = webdriver.Chrome(options=option)
        self.driver.maximize_window()
        # Workbook / active worksheet that receives the scraped rows.
        self.wb = Workbook()
        self.ws = self.wb.active
        # Header row. Extra "岗位情况N" columns are appended on demand by
        # get_date() when a posting has more than two description blocks.
        self.top = ['序号', '职位名称', '职位类别', '工作地点', '招聘人数', '更新时间',
                    '工作年限', '所属部门', '学历', '岗位情况1', '岗位情况2']

def ali(self, logger):

self.driver.get('https://job.alibaba.com/zhaopin/positionList.htm#page/1')

total_page = self.driver.find_element_by_xpath('//*[@id="J-pagination"]/div/ul/li[6]/a').text

logger("打开首页,当前共获取到%s页" % total_page)

while True:

input_page = input("请输入要获取的页数,不输入则全部获取(共%s页):" % total_page)

if input_page == '':

logger("全部数据获取中...")

input_page = int(total_page)

self.get_date(logger, input_page)

break

else:

if input_page.isdigit():

if int(input_page) > int(total_page):

logger("输入的页数超过最大页数,请重新输入!")

continue

elif int(input_page) == 0:

logger("不能为0")

continue

else:

self.get_date(logger, int(input_page))

break

else:

logger("请输入正确页数!")

continue

time.sleep(2)

file_end_name = time.strftime("%Y-%m-%d", time.localtime())

self.wb.save('alibaba' + file_end_name + '.xlsx')

logger("全部处理完成!")

def get_date(self, logger, input_page):

result_date = []

add_top = []

for i in range(1, input_page + 1):

time.sleep(1)

number_list = []

url = self.driver.current_url

page = int((url).split("/")[-1])

logger("当前第%s页数据获取中..." % page)

job_title = self.driver.find_elements_by_xpath('//*[@id="J-list-box"]/tr/td[1]/span/a')

length = len(job_title)

for j in range((i - 1) * length + 1, (i - 1) * length + 11):

number_list.append(j)

job_category = self.driver.find_elements_by_xpath('//*[@id="J-list-box"]/tr/td[2]/span')

job_address = self.driver.find_elements_by_xpath('//*[@id="J-list-box"]/tr/td[3]/span')

job_number = self.driver.find_elements_by_xpath('//*[@id="J-list-box"]/tr/td[4]/span')

job_date = self.driver.find_elements_by_xpath('//*[@id="J-list-box"]/tr/td[5]/span')

for k in range(len(number_list)):

all_date = []

all_date.append(number_list[k])

all_date.append(job_title[k].text.strip())

all_date.append(job_category[k].text.strip())

all_date.append(job_address[k].text.strip())

all_date.append(job_number[k].text.strip())

all_date.append(job_date[k].text.strip())

time.sleep(1)

logger("点击第%s页第%s个详情" % (page, k + 1))

self.driver.find_element_by_xpath(

'//*[@id="J-list-box"]/tr[' + str(2 * k + 1) + ']/td[1]/span/a').click()

self.all_handles = self.driver.window_handles

logger("切换到新窗口")

self.driver.switch_to.window(self.all_handles[1])

time.sleep(2)

working_life = self.driver.find_element_by_xpath('//table[@class="detail-table box-border"]/tbody/tr[1]/td[6]').text

all_date.append(working_life)

department = self.driver.find_element_by_xpath('//table[@class="detail-table box-border"]/tbody/tr[2]/td[2]').text

all_date.append(department)

education = self.driver.find_element_by_xpath('//table[@class="detail-table box-border"]/tbody/tr[2]/td[4]').text

all_date.append(education)

self.driver.close()

time.sleep(1)

logger("切回第一个窗口")

self.driver.switch_to.window(self.all_handles[0])

logger("展开第%s页第%s个按钮" % (page, k + 1))

self.driver.find_element_by_xpath('//*[@id="J-list-box"]/tr[' + str(2 * k + 1) + ']/td[6]/a').click()

job_content = self.driver.find_elements_by_xpath(

'//*[@id="J-list-box"]/tr[' + str(2 * k + 2) + ']/td/div/p')

for content_p in range(len(job_content)):

all_date.append(job_content[content_p].text.strip().replace("\n", "").replace("分享到:", ""))

if len(job_content) > 2:

logger("序号:%s,岗位:%s情况大于2段请查看原文"%(number_list[k],job_title[k].text.strip()))

for p_length in range(3,len(job_content)+1):

if "岗位情况"+str(p_length) not in self.top:

self.top.append("岗位情况"+str(p_length))

time.sleep(1)

logger("关闭第%s页第%s个按钮" % (page, k + 1))

self.driver.find_element_by_xpath('//*[@id="J-list-box"]/tr[' + str(2 * k + 1) + ']/td[6]/a').click()

time.sleep(1)

result_date.append(all_date)

time.sleep(1)

if input_page > page:

logger("点击下一页")

self.driver.find_element_by_xpath('//*[@id="J-pagination"]/div/ul/li[@data-index="next"]/a').click()

time.sleep(1)

logger("第%s页处理完成!" % page)

for row in range(len(result_date)):

for col in range(len(self.top)):

if result_date[row][col]:

value = result_date[row][col]

else:

value = ""

self.ws.cell(row=1,column= col+1,value=self.top[col])

self.ws.cell(row=row + 2, column=col + 1, value=value)

if __name__ == "__main__":
    logger = Log_info().main()
    time.sleep(2)
    saas = Saas()
    try:
        saas.ali(logger)
    except Exception as e:
        # Top-level boundary: report and swallow so the traceback goes
        # through the project logger. The original passed two positional
        # args (logger("出现异常:", e)); every other call site passes a
        # single pre-formatted string, so format here for consistency.
        logger("出现异常:%s" % e)

网址:python3利用selenium自动获取阿里社会招聘信息到表格(反爬selenium代码) https://www.yuejiaxmz.com/news/view/287127

相关内容

Selenium 流程自动化
自动化测试框架应该怎么选?Selenium、Playwright和Cypress详细对比
web自动化测试——通过pip安装selenium
selenium 定位方式3
python爬虫代码
软件测试实训|界面自动化测试工具Selenium IDE 录制回放
推荐一款新的自动化测试框架:DrissionPage
推荐这三款自动化爬虫软件,非常实用!
告别重复任务!帮你实现自动化生活的4个网页抓取项目
推荐几款常用Web自动化测试神器!

随便看看