用python一天爬取20万条企业信息(源码)

时间：2019-08-14 13:49:00 来源：作者：

爬虫环境

Python3.7+pycharm

最近发现一个网站，首商网，上面企业信息百万以上，然而网站一点儿反爬机制都没有，这对我们喜欢爬虫的来讲岂不是太爽了，直接拿出撸一套代码，用了三次并发，每次用20条线程，爬了五六个小时，拿下了20万条数据，美滋滋！

还是老规矩，下面直接上代码，所有的注释以及解释都在代码中，可以直接运行：

for k in range(1, 1651, 50):
# -*- coding: utf-8 -*-
# 本项目是原始的异步爬虫，没有封装为函数
import asyncio
import aiohttp
import time
from bs4 import BeautifulSoup
import csv
import requests
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
# 先用并发获取每个页面的子链接
########################################################################################################################
pro = 'zhaoshuang:LINA5201314@ 14.215.44.251:28803'
proxies = {'http://': 'http://' + pro,
'httpS://': 'https://' + pro
}
# 加入请求头
headers = {'User-Agent': 'Mozilla/5.0 (windows NT 10.0; WOW64) AppleWebKit'
'/537.36 (Khtml, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
wzs = []
def parser(url):
print(url)
try:
response = requests.get(url, headers=headers)
soup1 = BeautifulSoup(response.text, "lxml")
# body > div.list_contain > div.left > div.list_li > ul > li:nth-child(1) > table > tbody > tr > td:nth-child(3) > div.title > a
wz = soup1.select('div.title')
for i in wz:
wzs.append(i.contents[0].get("href"))
time.sleep(1)
except:
print('公司正在审核中')
urls = ['http://www.sooshong.com/c-3p{}'.format(num) for num in range(k, k + 50)]
# 利用并发加速爬取，最大线程为50个，本文章中一共有50个网站，可以加入50个线程
# 建立一个加速器对象，线程数每个网站都不同，太大网站接受不了会造成数据损失
executor = ThreadPoolExecutor(max_workers=10)
# submit()的参数：第一个为函数，之后为该函数的传入参数，允许有多个
future_tasks = [executor.submit(parser, url) for url in urls]
# 等待所有的线程完成，才进入后续的执行
wait(future_tasks, return_when=ALL_COMPLETED)
print('子页链接抓取完毕！')
########################################################################################################################
# 使用并发法爬取详细页链接
# 定义函数获取每个网页需要爬取的内容
wzs1 = []
def parser(url):
# 利用正则表达式解析网页
try:
res = requests.get(url, headers=headers)
# 对响应体进行解析
soup = BeautifulSoup(res.text, "lxml")
# 找到页面子链接，进入子页面，对子页面进行抓取
# 用select函数抽取需要的内容，单击需要的内容》检查》copy select
lianjie = soup.select('#main > div.main > div.intro > div.intros > div.text > p > a')
lianjie = lianjie[0].get('href')
wzs1.append(lianjie)
print(lianjie)
except:
print('子页解析失败')
# 利用并发加速爬取，最大线程为50个，本文章中一共有50个网站，可以加入50个线程
# 建立一个加速器对象，线程数每个网站都不同，太大网站接受不了会造成数据损失
executor = ThreadPoolExecutor(max_workers=10)
# submit()的参数：第一个为函数，之后为该函数的传入参数，允许有多个
future_tasks = [executor.submit(parser, url) for url in wzs]
# 等待所有的线程完成，才进入后续的执行
wait(future_tasks, return_when=ALL_COMPLETED)
print('详细页链接获取完毕！')
"""
# 使用异步法抓取子页面的链接
########################################################################################################################
async def get_html(sess, ur):
try:
proxy_auth = aiohttp.BasicAuth('zhaoshuang', 'LINA5201314')
html = await sess.get(ur,
headers=headers) # , proxy='http://'+'14.116.200.33:28803', proxy_auth=proxy_auth)
r = await html.text()
return r
except:
print("error")
# f = requests.get('http://211775.sooshong.com', headers=headers)
wzs1 = []
# 解析网页
async def parser(respo):
# 利用正则表达式解析网页
try:
# 对响应体进行解析
soup = BeautifulSoup(respo, "lxml")
# 找到页面子链接，进入子页面，对子页面进行抓取
# 用select函数抽取需要的内容，单击需要的内容》检查》copy select
lianjie = soup.select('#main > div.main > div.intro > div.intros > div.text > p > a')
lianjie = lianjie[0].get('href')
wzs1.append(lianjie)
print(lianjie)
company = soup.select("#main > div.aside > div.info > div.info_c > p:nth-child(1) > strong") # 标题
company = company[0].text
# 匹配电话号码
dianhua = soup.select("#main > div.aside > div.info > div.info_c > p:nth-child(3)") # 地址
dianhua = dianhua[0].text.split("：")[1]
# 匹配手机号码
phone = soup.select("#main > div.aside > div.info > div.info_c > p:nth-child(4)") # 日租价格
phone = phone[0].text.split("：")[1]
# 匹配传真
chuanzhen = soup.select("#main > div.aside > div.info > div.info_c > p:nth-child(5)") # 月租价格
chuanzhen = chuanzhen[0].text.split("：")[1]
# 经营模式
jingying = soup.select("#main > div.aside > div.info > div.info_c > p:nth-child(8)") # 面积大小
jingying = jingying[0].text.split("：")[1]
# 公司地址
address = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(9)') # 抽取建造年份
address = address[0].text.split("：")[1]
# 公司简介
# introduction = soup.select("#main > div.main > div.intro > div.intros > div.text > p") # 楼层属性
# introduction = introduction[0].text.strip()
data = [company, address, dianhua, phone, chuanzhen, jingying]
print(data)
with open('首富网企业7.csv', 'a+', newline='', encoding='GB2312', errors='ignore') as csvfile:
w1 = csv.writer(csvfile)
w1.writerow(data, [1])

except:
print("出错！")
async def main(loop):
async with aiohttp.ClientSession() as sess:
tasks = []
for ii in wzs:
ur = ii
try:
tasks.append(loop.create_task(get_html(sess, ur)))
except:
print('error')
# 设置0.1的网络延迟增加爬取效率
await asyncio.sleep(0.1)
finished, unfinised = await asyncio.wait(tasks)
for i1 in finished:
await parser(i1.result())
if __name__ == '__main__':
t1 = time.time()
loop = asyncio.get_event_loop()
loop.run_until_complete(main(loop))
print("花费时间", time.time() - t1)
print('详细页链接抓取完毕！')
"""
########################################################################################################################
# 使用并发法获取详细页的内容
########################################################################################################################
# 定义函数获取每个网页需要爬取的内容
def parser(url):
global data
try:
res = requests.get(url, headers=headers)
# 对响应体进行解析
soup = BeautifulSoup(res.text, 'lxml')
# 找到页面子链接，进入子页面，对子页面进行抓取
# 用select函数抽取需要的内容，单击需要的内容》检查》copy select
company = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(1) > strong')
company = company[0].text
name = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(2)')
name = name[0].text
dianhua = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(3)')
dianhua = dianhua[0].text.split('：')[1]
shouji = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(4)')
shouji = shouji[0].text.split('：')[1]
chuanzhen = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(5)')
chuanzhen = chuanzhen[0].text.split('：')[1]
product = soup.select('tr:nth-child(1) > td:nth-child(2)')
product = product[0].text
company_type = soup.select('tr:nth-child(2) > td:nth-child(2) > span')
company_type = company_type[0].text.strip()
legal_person = soup.select('tr:nth-child(3) > td:nth-child(2)')
legal_person = legal_person[0].text
main_address = soup.select('tr:nth-child(5) > td:nth-child(2) > span')
main_address = main_address[0].text
brand = soup.select('tr:nth-child(6) > td:nth-child(2) > span')
brand = brand[0].text
area = soup.select('tr:nth-child(9) > td:nth-child(2) > span')
area = area[0].text
industry = soup.select('tr:nth-child(1) > td:nth-child(4)')
industry = industry[0].text.strip()
address = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(9)')
address = address[0].text.split('：')[1]
jingying = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(8)')
jingying = jingying[0].text.split('：')[1]
date = soup.select('tr:nth-child(5) > td:nth-child(4) > span')
date = date[0].text
wangzhi = soup.select('tr:nth-child(12) > td:nth-child(4) > p > span > a')
wangzhi = wangzhi[0].text
data = [company, date, name, legal_person, shouji, dianhua, chuanzhen, company_type, jingying, industry,
product, wangzhi, area, brand, address, main_address] # 将以上数据放入列表中打印在命令框
print(data)
with open('服装1.csv', 'a', newline='', encoding='GB2312') as csvfile:
w1 = csv.writer(csvfile)
w1.writerow(data)
except:
with open('服装2.csv', 'a', newline='', encoding='utf-8-sig') as csvfile:
w1 = csv.writer(csvfile)
w1.writerow(data)
print('utf解码成功')
# 利用并发加速爬取，最大线程为50个，本文章中一共有50个网站，可以加入50个线程
# 建立一个加速器对象，线程数每个网站都不同，太大网站接受不了会造成数据损失
executor = ThreadPoolExecutor(max_workers=10)
# submit()的参数：第一个为函数，之后为该函数的传入参数，允许有多个
future_tasks = [executor.submit(parser, url) for url in wzs1]
# 等待所有的线程完成，才进入后续的执行
wait(future_tasks, return_when=ALL_COMPLETED)
print('全部信息抓取完毕')

Tags：python 点击:() 评论:()

声明：本站部分内容及图片来自互联网,转载是出于传递更多信息之目的,内容观点仅代表作者本人,如有任何标注错误或版权侵犯请与我们联系(Email:2595517585@qq.com)，我们将及时更正、删除，谢谢。

▌相关推荐

Python4要来了？快来看看Python之父怎么说

大家好，我是菜鸟哥，今天跟大家一起聊一下Python4的话题！从2020年的1月1号开始，Python官方正式的停止了对于Python2的维护。Python也正式的进入了Python3的时代。而随着时间的...【详细内容】

2021-12-28　　Tags: python 点击:(1)　　评论:(0)　　加入收藏

Python如何构建自动在线刷视频

学习Python的初衷是因为它的实践的便捷性，几乎计算机上能完成的各种操作都能在Python上找到解决途径。平时工作需要在线学习。而在线学习的复杂性经常让人抓狂。费时费力且效...【详细内容】

2021-12-28　　Tags: python 点击:(1)　　评论:(0)　　加入收藏

非常实用的 Python 库，推一次火一次

Python 是一个很棒的语言。它是世界上发展最快的编程语言之一。它一次又一次地证明了在开发人员职位中和跨行业的数据科学职位中的实用性。整个 Python 及其库的生态系统使...【详细内容】

2021-12-27　　Tags: python 点击:(2)　　评论:(0)　　加入收藏

Python中的菜单驱动程序

菜单驱动程序简介菜单驱动程序是通过显示选项列表从用户那里获取输入并允许用户从选项列表中选择输入的程序。菜单驱动程序的一个简单示例是 ATM（自动取款机）。在交易的情况下...【详细内容】

2021-12-27　　Tags: python 点击:(4)　　评论:(0)　　加入收藏

FastAPI - 一款新型的 Python Web 框架(对比 Flask)

近日只是为了想尽办法为 Flask 实现 Swagger UI 文档功能，基本上要让 Flask 配合 Flasgger, 所以写了篇 Flask 应用集成 Swagger UI 。然而不断的 Google 过程中偶然间发现了...【详细内容】

2021-12-23　　Tags: python 点击:(6)　　评论:(0)　　加入收藏

15个Python入门小程序，你都知道哪些

有不少同学学完Python后仍然很难将其灵活运用。我整理15个Python入门的小程序。在实践中应用Python会有事半功倍的效果。01 实现二元二次函数实现数学里的二元二次函数：f(x,...【详细内容】

2021-12-22　　Tags: python 点击:(32)　　评论:(0)　　加入收藏

用Python提取Verilog网表层次和实例化关系

Verilog是由一个个module组成的，下面是其中一个module在网表中的样子，我只需要提取module名字、实例化关系。module rst_filter ( ...); 端口声明... wire定义......【详细内容】

2021-12-22　　Tags: python 点击:(9)　　评论:(0)　　加入收藏

使用 Python 将 MP4视频转换为GIF动画

运行环境如何从 MP4 视频中提取帧将帧变成 GIF 创建 MP4 到 GIF GUI ...【详细内容】

2021-12-22　　Tags: python 点击:(6)　　评论:(0)　　加入收藏

python的面向对象编程

面向对象：Object Oriented Programming，简称OOP，即面向对象程序设计。类(Class)和对象(Object)类是用来描述具有相同属性和方法对象的集合。对象是类的具体实例。比如，学生都有...【详细内容】

2021-12-22　　Tags: python 点击:(9)　　评论:(0)　　加入收藏

python初学者必须吃透的这些内置函数

所谓内置函数，就是Python提供的, 可以直接拿来直接用的函数，比如大家熟悉的print，range、input等，也有不是很熟，但是很重要的，如enumerate、zip、join等，Python内置的这些函数非常...【详细内容】

2021-12-21　　Tags: python 点击:(5)　　评论:(0)　　加入收藏

▌简易百科推荐

Python4要来了？快来看看Python之父怎么说

2021-12-28　　菜鸟学python　　　　Tags:Python4 　点击:(1)　　评论:(0)　　加入收藏

Python如何构建自动在线刷视频

2021-12-28　　风度翩翩的Python　　　　Tags:Python 　点击:(1)　　评论:(0)　　加入收藏

非常实用的 Python 库，推一次火一次

2021-12-27　　IT资料库　　　　Tags:Python 库　点击:(2)　　评论:(0)　　加入收藏

Python中的菜单驱动程序

2021-12-27　　子冉爱python　　　　Tags:Python 　点击:(4)　　评论:(0)　　加入收藏

15个Python入门小程序，你都知道哪些

2021-12-22　　程序汪小成　　　　Tags:Python入门　点击:(32)　　评论:(0)　　加入收藏

用Python提取Verilog网表层次和实例化关系

2021-12-22　　编程啊青　　　　Tags:Verilog 　点击:(9)　　评论:(0)　　加入收藏

使用 Python 将 MP4视频转换为GIF动画

运行环境如何从 MP4 视频中提取帧将帧变成 GIF 创建 MP4 到 GIF GUI ...【详细内容】

2021-12-22　　修道猿　　　　Tags:Python 　点击:(6)　　评论:(0)　　加入收藏

python的面向对象编程

2021-12-22　　我头秃了　　　　Tags:python 　点击:(9)　　评论:(0)　　加入收藏

python初学者必须吃透的这些内置函数

2021-12-21　　程序员小新ds　　　　Tags:python初　点击:(5)　　评论:(0)　　加入收藏

Python实现各种加密，接口加解密不说难

Hi，大家好。我们在接口自动化测试项目中，有时候需要一些加密。今天给大伙介绍Python实现各种加密，接口加解密再也不愁。目录一、项目加解密需求分析六、Python加密库PyCrypto...【详细内容】

2021-12-21　　Python可乐　　　　Tags:Python 　点击:(8)　　评论:(0)　　加入收藏

推荐资讯

聊聊如何自定义数据脱	河南人到底有多爱吃面
人称“犬中四煞”的4	离婚后，约定每月给孩子
“三皇五帝”分别是哪	印度低种姓群体如何翻
日本研发“飞行摩托”	2021年Steam最畅销游