爬虫实战项目

发布时间:2024-12-09 02:00

Python爬虫实战:requests库应用 #生活知识# #编程教程#

"""Crawl used-car (BMW, Zhengzhou) listings from www.xin.com.

List pages are fetched in a process pool; every list page is parsed for
detail-page links, which are fetched in a second pool, and each detail page
is reduced to a (url, title, price) line printed to stdout.
"""
import random
import time
from multiprocessing import Pool

import requests
from fake_useragent import UserAgent
from lxml.html import etree, HTMLParser
from requests.packages.urllib3.exceptions import InsecureRequestWarning, InsecurePlatformWarning

# Silence the HTTPS warnings triggered by the verify=False requests below.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
requests.packages.urllib3.disable_warnings(InsecurePlatformWarning)

ua = UserAgent()

# Module-level crawl state.  NOTE(review): each pool worker holds its own copy
# of these globals, so an update made inside a worker is not seen by the
# parent process or by sibling workers - confirm this is acceptable.
number = 1  # detail pages parsed so far; drives the session_xin rotation
new_session_xin = 'k8935l0tr72p6dfngdfnuiukoo4n6jfn'
anti_uid = '8F932282-2E08-FA10-DDDC-841EEF3E0BF3'

# Seconds to wait for any single HTTP request before giving up, so that one
# dead proxy cannot hang a worker (and therefore pool.join()) forever.
REQUEST_TIMEOUT = 10

# Value get_session_xin() returns when the server issued no session_xin.
MISSING_SESSION = '没有'

# Base values that get_detail_page offsets by `number` to fake XIN_bhv_oc.
_BHV_OC_BASES = (
    1525, 1319, 1262, 1436, 1561, 1452, 1618, 1624,
    1632, 1631, 1646, 1742, 1814, 1891, 1847, 2286,
)

# Cookie fragment shared verbatim by every request.
_STATIC_COOKIES = (
    'XIN_LOCATION_CITY=%7B%22cityid%22%3A%221001%22'
    '%2C%22areaid%22%3A%224%22'
    '%2C%22big_areaid%22%3A%222%22'
    '%2C%22provinceid%22%3A%2210%22'
    '%2C%22cityname%22%3A%22%5Cu90d1%5Cu5dde%22'
    '%2C%22ename%22%3A%22zhengzhou%22'
    '%2C%22shortname%22%3A%22ZN%22'
    '%2C%22service%22%3A%221%22'
    '%2C%22near%22%3A%22201%2C501%2C2101%2C2117%2C1010%2C1002%2C601%2C2401%2C901%2C1201%22'
    '%2C%22tianrun_code%22%3A%220371%22'
    '%2C%22zhigou%22%3A%221%22'
    '%2C%22longitude%22%3A%22113.6253680%22'
    '%2C%22latitude%22%3A%2234.7465990%22'
    '%2C%22direct_rent_support%22%3A%221%22'
    '%2C%22salvaged_support%22%3A%221%22'
    '%2C%22isshow_c%22%3A%221%22%7D'
    '; uid=rBAKEls5vG1giwDiR4LWAg=='
    '; NSC_20.eqppmxfc.yjo.dpn=ffffffffaf18140345525d5f4f58455e445a4a423660'
    '; XIN_UID_CK=5e21beea-146c-a405-2a32-2df07fc0eac9'
)

# Cookie fragment sent with list and detail pages (not with the token call).
_VISIT_COOKIES = (
    'Hm_lvt_ae57612a280420ca44598b857c8a9712=1530510447'
    '; Hm_lpvt_ae57612a280420ca44598b857c8a9712={tm}'
    '; session_xin={session}'
    '; SEO_REF=https://www.xin.com/zhengzhou/baoma/'
)


def _build_headers(cookie):
    """Return the request headers used for www.xin.com with the given cookie."""
    return {
        'User-Agent': ua.random,
        'Host': 'www.xin.com',
        'Referer': 'https://www.xin.com/zhengzhou/baoma/',
        'Cookie': cookie,
    }


def _parse_html(markup):
    """Parse an HTML string; return the root element or None if unparsable."""
    try:
        return etree.HTML(markup, parser=HTMLParser(encoding='utf-8'))
    except (etree.LxmlError, ValueError):
        return None


def _report_task_error(exc):
    """Print an exception raised inside a pool worker instead of dropping it."""
    print('task failed: {!r}'.format(exc))


def get_proxy():
    """Fetch one proxy address from the local proxy pool service.

    Returns:
        A requests-style proxies mapping.  Both schemes are mapped because
        every target URL is https; with only an 'http' entry requests would
        bypass the proxy for them.
    """
    address = requests.get('http://localhost:5010/get/', timeout=REQUEST_TIMEOUT).text.strip()
    proxy_url = 'http://' + address
    return {'http': proxy_url, 'https': proxy_url}


def get_session_xin():
    """Request a fresh session_xin token.

    Returns:
        The session_xin cookie set by the response, or '没有' when the
        response carries no such cookie.
    """
    cookie = 'XIN_bhv_oc=1233; XIN_anti_uid={}; '.format(anti_uid) + _STATIC_COOKIES
    response = requests.get(
        'https://www.xin.com/search/get_wishlist_token',
        headers=_build_headers(cookie),
        proxies=get_proxy(),
        verify=False,
        timeout=REQUEST_TIMEOUT,
    )
    # The token arrives in the Set-Cookie response header.
    session_xin = response.cookies.get('session_xin', MISSING_SESSION)
    print(session_xin)
    return session_xin


def get_list_page(page_num):
    """Download list page `page_num` and return its HTML text.

    Also replaces the global anti_uid when the response sets a new
    XIN_anti_uid cookie.
    """
    global anti_uid
    url = 'https://www.xin.com/zhengzhou/baoma/i{}'.format(page_num)
    cookie = '; '.join([
        'RELEASE_KEY=; XIN_bhv_oc=1233; XIN_anti_uid={}'.format(anti_uid),
        _STATIC_COOKIES,
        _VISIT_COOKIES.format(tm=int(time.time()), session=new_session_xin),
    ])
    response = requests.get(
        url,
        headers=_build_headers(cookie),
        verify=False,
        proxies=get_proxy(),
        timeout=REQUEST_TIMEOUT,
    )
    uid = response.cookies.get('XIN_anti_uid', '')
    if uid:
        print('uid = ', uid)
        anti_uid = uid
    else:
        print('uid 不存在')
    return response.text


def parse_list_page(list_page):
    """Extract detail links from a list page and crawl them in a sub-pool.

    Runs as a pool callback, so it must not raise: an exception here would
    stop the pool's result handling and leave pool.join() waiting.
    """
    list_obj = _parse_html(list_page)
    if list_obj is None:
        return
    list_pool = Pool(4)
    for link in list_obj.cssselect('h2 .tit'):
        href = link.attrib.get('href')
        if not href:
            continue
        # hrefs are concatenated with the scheme, i.e. treated as '//host/path'.
        list_pool.apply_async(
            get_detail_page,
            args=('https:' + href,),
            callback=parse_detail_page,
            error_callback=_report_task_error,
        )
    list_pool.close()
    list_pool.join()


def get_detail_page(detail_url):
    """Download one detail page.

    Returns:
        A (html_text, detail_url) tuple, consumed by parse_detail_page.
    """
    global number, new_session_xin
    # Swap session_xin whenever the page counter reaches a multiple of 9, then
    # carry on with the request.  The original recursed here and discarded the
    # result, handing None to the callback.
    if number % 9 == 0:
        number += 1
        refreshed = get_session_xin()
        # Keep the previous token when none was issued: the '没有' placeholder
        # is not Latin-1 encodable and cannot be sent as a cookie value.
        if refreshed != MISSING_SESSION:
            new_session_xin = refreshed

    cookie = '; '.join([
        # XIN_bhv_oc takes the pseudo-random counter and XIN_anti_uid the uid;
        # the original passed these two arguments in swapped order.
        'RELEASE_KEY=; XIN_bhv_oc={}; XIN_anti_uid={}'.format(
            random.choice(_BHV_OC_BASES) + number, anti_uid),
        _STATIC_COOKIES,
        _VISIT_COOKIES.format(tm=int(time.time()), session=new_session_xin),
        'XIN_CARBROWSE_IDS=%5B67720293%5D; XIN_bhv_pc={}; XIN_bhv_expires=1530597119591'.format(number),
    ])
    response = requests.get(
        detail_url,
        headers=_build_headers(cookie),
        verify=False,
        proxies=get_proxy(),
        timeout=REQUEST_TIMEOUT,
    )
    return response.text, detail_url


def parse_detail_page(detail_tuple):
    """Print url, title and price for one detail page and bump the counter.

    Args:
        detail_tuple: the (html_text, detail_url) pair from get_detail_page.
    """
    global number
    if not detail_tuple:
        return
    detail_page, detail_url = detail_tuple
    detail_obj = _parse_html(detail_page)
    if detail_obj is None:
        return

    title_texts = detail_obj.xpath('//span[@class="cd_m_h_tit"]//text()')
    price_texts = detail_obj.xpath('//span[@class="cd_m_info_jg"]/b/text()')
    if not title_texts or not price_texts:
        # Page layout not recognised (e.g. an anti-crawler page); skip it.
        return

    # Prefer the fourth text node of the title span, else the first one.
    title = title_texts[3] if len(title_texts) > 3 else title_texts[0].strip()
    price = price_texts[0].strip()
    print(detail_url, title, price)
    number += 1


if __name__ == '__main__':
    pool = Pool(4)
    for x in range(1, 51):
        print('开始获取第{}页...'.format(x))
        pool.apply_async(
            get_list_page,
            args=(x,),
            callback=parse_list_page,
            error_callback=_report_task_error,
        )
    pool.close()
    pool.join()

天眼查(www.tianyancha.com)爬虫示例

"""Crawl company search results for KEY_WORD from www.tianyancha.com.

Search-result pages are fetched in a process pool; each result row yields a
detail URL plus a few fields, the detail page is fetched in a second pool,
and its digit-substituted date is decoded and printed with the row fields.
"""
import time
from multiprocessing import Pool
from urllib.parse import quote

import requests
from fake_useragent import UserAgent
from lxml.html import etree
from requests.packages.urllib3.exceptions import InsecureRequestWarning, InsecurePlatformWarning

# Silence the HTTPS warnings triggered by verify=False requests.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
requests.packages.urllib3.disable_warnings(InsecurePlatformWarning)

ua = UserAgent()

# Digit substitution tables for text inside `.tyc-num` elements.
# Samples under the first table:  8935 -> 6048, 8936 -> 6043
number_dict1 = {
    '0': '9', '1': '2', '2': '1', '3': '4', '4': '7',
    '5': '8', '6': '3', '7': '5', '8': '6', '9': '0',
    '-': '-',
}
# Samples under the second table: 8936-94-90 -> 2017-08-09,
#                                 8936-95-84 -> 2017-06-28
number_dict2 = {
    '0': '9', '1': '4', '2': '5', '3': '1', '4': '8',
    '5': '6', '6': '7', '7': '3', '8': '2', '9': '0',
    '-': '-',
}

KEY_WORD = '智游'

# Seconds to wait for any single HTTP request before giving up, so that one
# dead proxy cannot hang a worker (and therefore pool.join()) forever.
REQUEST_TIMEOUT = 10

# NOTE(review): a logged-in session (auth token, account phone number) is
# hard-coded below; it should be supplied through configuration instead.
_AUTH_TOKEN = (
    'eyJhbGciOiJIUzUxMiJ9'
    '.eyJzdWIiOiIxMzAzNzY3NzMxOCIsImlhdCI6MTUzMDUzMTAyMSwiZXhwIjoxNTQ2MDgzMDIxfQ'
    '.VhZp2799GMlRKWPnleSODWuG2-fC7Prn9LdC0CYIxotinpsOwXgvJxpAfuxJGCmLUEK-90jJvOUOirPeeonrGA'
)
_COOKIE_TEMPLATE = (
    'TYCID=2b902090793a11e8bbf42fcb3431841d; '
    'undefined=2b902090793a11e8bbf42fcb3431841d; '
    'Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1530015137,1530061830,1530104465,1530519246; '
    'ssuid=4009891320; '
    'aliyungf_tc=AQAAAP+boVulnQoAg6cPqxTilju98D0f; '
    'csrfToken=yg6QXuv2Dch1Abfr-giP-AH4; '
    'Hm_lpvt_e92c8d65d92d534b0fc290df538b4758={tm}; '
    'RTYCID=24412db0b3da41c5be4439ba8f942ce8; '
    'bannerFlag=true; '
    'token=1675836c554a48fe9bcc18cfc45cb4d0; '
    '_utm=788b0bb711164fda9a5e6b1964bb5bf9; '
    'tyc-user-info=%257B%2522token%2522%253A%2522{jwt}%2522'
    '%252C%2522integrity%2522%253A%25220%2525%2522'
    '%252C%2522state%2522%253A%25220%2522'
    '%252C%2522redPoint%2522%253A%25220%2522'
    '%252C%2522vipManager%2522%253A%25220%2522'
    '%252C%2522vnum%2522%253A%25220%2522'
    '%252C%2522onum%2522%253A%25220%2522'
    '%252C%2522mobile%2522%253A%252213037677318%2522%257D; '
    'auth_token={jwt}'
)


def _build_headers():
    """Return request headers carrying the session cookie stamped with now."""
    return {
        'User-Agent': ua.random,
        'Host': 'www.tianyancha.com',
        'Cookie': _COOKIE_TEMPLATE.format(tm=int(time.time()), jwt=_AUTH_TOKEN),
    }


def _parse_html(markup):
    """Parse an HTML string; return the root element or None if unparsable."""
    try:
        return etree.HTML(markup, parser=etree.HTMLParser(encoding='utf-8'))
    except (etree.LxmlError, ValueError):
        return None


def _decode_digits(text, table):
    """Map every character of `text` through `table`, keeping unknown ones."""
    return ''.join(table.get(ch, ch) for ch in text)


def _report_task_error(exc):
    """Print an exception raised inside a pool worker instead of dropping it."""
    print('task failed: {!r}'.format(exc))


def get_proxy():
    """Fetch one proxy address from the local proxy pool service.

    Returns:
        A requests-style proxies mapping.  Both schemes are mapped because
        every target URL is https; with only an 'http' entry requests would
        bypass the proxy for them.
    """
    address = requests.get('http://localhost:5010/get/', timeout=REQUEST_TIMEOUT).text.strip()
    proxy_url = 'http://' + address
    return {'http': proxy_url, 'https': proxy_url}


def get_list_page(page_num):
    """Download search-result page `page_num` for KEY_WORD; return its HTML."""
    list_url = 'https://www.tianyancha.com/search/p{}?key={}'.format(page_num, quote(KEY_WORD))
    response = requests.get(
        list_url,
        headers=_build_headers(),
        verify=False,
        proxies=get_proxy(),
        timeout=REQUEST_TIMEOUT,
    )
    return response.text


def parse_list_page(list_page):
    """Extract the result rows of a search page and crawl their detail pages.

    Runs as a pool callback, so it must not raise: an exception here would
    stop the pool's result handling and leave pool.join() waiting.
    """
    list_html = _parse_html(list_page)
    if list_html is None:
        return
    divs = list_html.cssselect('.search_row_new')
    all_a = list_html.cssselect('.query_name')

    detail_pool = Pool(4)
    # zip() pairs rows with links positionally, as the original index loop
    # did, but stops cleanly if the two selections differ in length.
    for div, link in zip(divs, all_a):
        detail_url = link.attrib.get('href')
        persons = div.cssselect('.legalPersonName')
        capitals = div.xpath('.//span[contains(@title, "人民币")]/text()')
        if not detail_url or not persons or not capitals:
            # Rows lacking a legal person or an RMB capital figure are skipped.
            continue
        dates = div.xpath('.//span[contains(@title, "-")]/text()')
        detail_pool.apply_async(
            get_detail_page,
            args=(detail_url, persons[0].text, capitals[0], dates[0] if dates else ''),
            callback=parse_detail_page,
            error_callback=_report_task_error,
        )
    detail_pool.close()
    detail_pool.join()


def get_detail_page(detail_url, person, zhuceziben, zhuceshijian):
    """Download one company detail page.

    Returns:
        (html_text, detail_url, person, zhuceziben, zhuceshijian); the row
        fields are passed through unchanged for parse_detail_page.
    """
    response = requests.get(
        detail_url,
        headers=_build_headers(),
        proxies=get_proxy(),
        timeout=REQUEST_TIMEOUT,
    )
    return response.text, detail_url, person, zhuceziben, zhuceshijian


def parse_detail_page(detail_tuple):
    """Decode the obfuscated date of a detail page and print the record.

    Args:
        detail_tuple: the 5-tuple returned by get_detail_page.
    """
    if not detail_tuple:
        return
    detail_html, detail_url, person, zhuceziben, zhuceshijian = detail_tuple

    res_str = ''
    detail_obj = _parse_html(detail_html)
    nodes = detail_obj.cssselect('.base0910 .tyc-num') if detail_obj is not None else []
    date_str = nodes[0].text if nodes else None
    if date_str:
        res_str = _decode_digits(date_str, number_dict1)
        # A result not starting with '2' means the first table did not fit;
        # retry with the second substitution table.
        if not res_str.startswith('2'):
            res_str = _decode_digits(date_str, number_dict2)
    print(detail_url, person, zhuceziben, zhuceshijian, res_str)


if __name__ == '__main__':
    pool = Pool(1)
    for x in range(1, 6):
        pool.apply_async(
            get_list_page,
            args=(x,),
            callback=parse_list_page,
            error_callback=_report_task_error,
        )
    pool.close()
    pool.join()

网址:爬虫实战项目 https://www.yuejiaxmz.com/news/view/419696

相关内容

《Python3爬虫、数据清洗和可视化实战》之阅读不懂处、主要代码总结(5
python爬虫代码
推荐这三款自动化爬虫软件,非常实用!
python爬虫
住一楼如何防虫爬进屋
Python爬虫山东济南景点数据可视化和景点推荐系统 开题报告
python爬虫与数据分析之《向往的生活爬取》
Harmony鸿蒙实战开发项目
"爬山:一项结合自然美景与健身的完美运动"
攀岩拓展项目

随便看看