# The imported debug_info package is described at: https://blog.csdn.net/z564359805/article/details/85624881
from selenium import webdriver
import time
from debug_info import Log_info
from openpyxl import Workbook
from selenium.webdriver import ChromeOptions
class Saas(object):
    """Scrape Alibaba job postings with Selenium and export them to an Excel file.

    The scraper drives a Chrome window through the paginated position list,
    opens every posting's detail page, collects the displayed fields and
    finally writes one spreadsheet row per posting.

    NOTE(review): the ``find_element(s)_by_xpath`` helpers used here were
    removed in Selenium 4.3; running on a newer Selenium requires migrating to
    ``find_element(By.XPATH, ...)`` -- confirm the installed version.
    """

    def __init__(self):
        """Start a maximized Chrome session and create the output workbook."""
        option = ChromeOptions()
        # Exclude Chrome's 'enable-automation' switch (commonly done to hide
        # the "controlled by automated test software" banner).
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        self.driver = webdriver.Chrome(options=option)
        self.driver.maximize_window()
        self.wb = Workbook()
        self.ws = self.wb.active
        # Header row; get_date() appends further '岗位情况N' columns when a
        # posting has more than two description paragraphs.
        self.top = ['序号', '职位名称', '职位类别', '工作地点', '招聘人数', '更新时间',
                    '工作年限', '所属部门', '学历', '岗位情况1', '岗位情况2']

    def ali(self, logger):
        """Open the position list, ask how many pages to scrape, scrape and save.

        Args:
            logger: callable that accepts a single message string.

        The workbook is saved as ``alibaba<YYYY-MM-DD>.xlsx`` in the current
        working directory.
        """
        self.driver.get('https://job.alibaba.com/zhaopin/positionList.htm#page/1')
        total_page = self.driver.find_element_by_xpath(
            '//*[@id="J-pagination"]/div/ul/li[6]/a').text
        logger("打开首页,当前共获取到%s页" % total_page)
        max_page = int(total_page)

        # Keep prompting until the user gives an empty answer (= all pages)
        # or a page count within 1..max_page.
        while True:
            input_page = input("请输入要获取的页数,不输入则全部获取(共%s页):" % total_page)
            if input_page == '':
                logger("全部数据获取中...")
                page_count = max_page
                break
            # isdecimal() (unlike isdigit()) only accepts characters int() can
            # parse, so e.g. a superscript digit no longer raises ValueError.
            if not input_page.isdecimal():
                logger("请输入正确页数!")
                continue
            page_count = int(input_page)
            if page_count > max_page:
                logger("输入的页数超过最大页数,请重新输入!")
                continue
            if page_count == 0:
                logger("不能为0")
                continue
            break

        self.get_date(logger, page_count)
        time.sleep(2)
        file_end_name = time.strftime("%Y-%m-%d", time.localtime())
        self.wb.save('alibaba' + file_end_name + '.xlsx')
        logger("全部处理完成!")

    def get_date(self, logger, input_page):
        """Scrape ``input_page`` list pages and write the results to the worksheet.

        Starting from the page currently shown in the browser, every posting on
        each page is read from the list, its detail page is opened in a new
        window for the extra fields, and its inline description is expanded and
        collected. Rows are written to ``self.ws`` once all pages are done.

        Args:
            logger: callable that accepts a single message string.
            input_page: number of pages to process.
        """
        result_date = []
        list_box = '//*[@id="J-list-box"]'
        for _ in range(input_page):
            time.sleep(1)
            # The current page number is the last segment of the URL
            # (".../positionList.htm#page/<n>").
            page = int(self.driver.current_url.split("/")[-1])
            logger("当前第%s页数据获取中..." % page)
            job_title = self.driver.find_elements_by_xpath(list_box + '/tr/td[1]/span/a')
            job_category = self.driver.find_elements_by_xpath(list_box + '/tr/td[2]/span')
            job_address = self.driver.find_elements_by_xpath(list_box + '/tr/td[3]/span')
            job_number = self.driver.find_elements_by_xpath(list_box + '/tr/td[4]/span')
            job_date = self.driver.find_elements_by_xpath(list_box + '/tr/td[5]/span')

            # Iterate over the postings actually present on this page instead
            # of assuming ten, so a shorter (e.g. last) page does not overrun.
            for k in range(len(job_title)):
                # Running sequence number across all pages, starting at 1.
                all_date = [
                    len(result_date) + 1,
                    job_title[k].text.strip(),
                    job_category[k].text.strip(),
                    job_address[k].text.strip(),
                    job_number[k].text.strip(),
                    job_date[k].text.strip(),
                ]
                # Postings occupy the odd table rows; the even row that follows
                # holds the expandable description of the same posting.
                summary_row = '%s/tr[%d]' % (list_box, 2 * k + 1)
                content_row = '%s/tr[%d]' % (list_box, 2 * k + 2)

                time.sleep(1)
                logger("点击第%s页第%s个详情" % (page, k + 1))
                self.driver.find_element_by_xpath(summary_row + '/td[1]/span/a').click()
                all_date.extend(self._read_detail_window(logger))

                logger("展开第%s页第%s个按钮" % (page, k + 1))
                self.driver.find_element_by_xpath(summary_row + '/td[6]/a').click()
                job_content = self.driver.find_elements_by_xpath(content_row + '/td/div/p')
                for paragraph in job_content:
                    all_date.append(
                        paragraph.text.strip().replace("\n", "").replace("分享到:", ""))
                if len(job_content) > 2:
                    logger("序号:%s,岗位:%s情况大于2段请查看原文" % (all_date[0], all_date[1]))
                    # Grow the header so every extra paragraph gets a column.
                    for p_length in range(3, len(job_content) + 1):
                        column_title = "岗位情况" + str(p_length)
                        if column_title not in self.top:
                            self.top.append(column_title)
                time.sleep(1)
                logger("关闭第%s页第%s个按钮" % (page, k + 1))
                self.driver.find_element_by_xpath(summary_row + '/td[6]/a').click()
                time.sleep(1)
                result_date.append(all_date)

            time.sleep(1)
            if input_page > page:
                logger("点击下一页")
                self.driver.find_element_by_xpath(
                    '//*[@id="J-pagination"]/div/ul/li[@data-index="next"]/a').click()
                time.sleep(1)
            logger("第%s页处理完成!" % page)

        self._write_sheet(result_date)

    def _read_detail_window(self, logger):
        """Read the extra fields from the newly opened detail window and return to the list.

        Returns:
            A list of three strings read from the detail table; by the header
            order they are working years, department and education.
        """
        self.all_handles = self.driver.window_handles
        logger("切换到新窗口")
        self.driver.switch_to.window(self.all_handles[1])
        time.sleep(2)
        cell_xpath = '//table[@class="detail-table box-border"]/tbody/tr[%d]/td[%d]'
        details = [
            self.driver.find_element_by_xpath(cell_xpath % (row, col)).text
            for row, col in ((1, 6), (2, 2), (2, 4))
        ]
        # Close the detail window and hand control back to the list window.
        self.driver.close()
        time.sleep(1)
        logger("切回第一个窗口")
        self.driver.switch_to.window(self.all_handles[0])
        return details

    def _write_sheet(self, rows):
        """Write the header (row 1) and the collected rows (from row 2) to ``self.ws``.

        Rows shorter than the header are padded with empty strings, because the
        number of description paragraphs differs between postings; falsy cell
        values are written as empty strings.
        """
        for col, title in enumerate(self.top, start=1):
            self.ws.cell(row=1, column=col, value=title)
        width = len(self.top)
        for row_idx, row in enumerate(rows, start=2):
            padded = list(row) + [""] * (width - len(row))
            for col, value in enumerate(padded, start=1):
                self.ws.cell(row=row_idx, column=col, value=value if value else "")
if __name__ == "__main__":
    # Script entry point: build the logger, run the scraper, always clean up.
    logger = Log_info().main()
    time.sleep(2)
    saas = Saas()
    try:
        saas.ali(logger)
    except Exception as e:
        # Top-level boundary: report the failure instead of crashing. The
        # logger is called with one pre-formatted string, as everywhere else
        # in this file.
        logger("出现异常:%s" % e)
    finally:
        # Shut down the browser and the chromedriver process even on failure.
        saas.driver.quit()