A multi-process Bilibili crawler with Selenium

Using Selenium together with Python's multiprocessing to crawl user data from Bilibili.

# Code

# Note: change the path to the headless browser and install any missing libraries
# Data is written to CSV files (openable in Excel), so prepare the .csv files in advance

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import csv
from multiprocessing import Pool
import os, time
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import random

# Worker process; the main crawling logic lives here
def scrawler_process(process_start, num, sum, pace):
    print('Run task %s (%s)...' % (num, os.getpid()))
    start = time.time()

    # Spoof the user agent to make anti-crawler blocking less likely
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36"
    )
    service_args = []
    # Disabling image loading triggers an odd bug, so it stays commented out
    # service_args.append('--load-images=no')       # disable image loading
    service_args.append('--disk-cache=yes')          # enable the disk cache
    service_args.append('--ignore-ssl-errors=true')  # ignore HTTPS errors
    phantom_path = 'phantomjs\\bin\\phantomjs.exe'
    driver = webdriver.PhantomJS(executable_path=phantom_path, service_args=service_args,
                                 desired_capabilities=dcap)

    # Number of valid records crawled so far
    counter = 0
    # Current uid to visit
    urlNumber = process_start + num
    # Number of consecutive failed fetches
    fail_num = 0

    # CSV output file for this process
    filename = 'bilibili-%d.csv' % (num)
    out = open(filename, "a", newline="", encoding="utf-8")
    csv_writer = csv.writer(out, dialect="excel")

    while (counter < sum and urlNumber < 272500000):
        # Pick a uid at random around the current position
        url = "https://space.bilibili.com/%d#/dynamic" % (urlNumber + random.randint(-1249, +1249))  # page to crawl
        driver.get(url)
        print(url)
        # bsObj = BeautifulSoup(driver.page_source)
        # print(bsObj.prettify())

        try:
            # Wait until the page has loaded far enough to contain the profile content
            element = WebDriverWait(driver, 2).until(
                EC.presence_of_element_located((By.CLASS_NAME, "content")))
            span = driver.find_element_by_id("h-name")

            # Extract and normalize the gender field
            sexSpanClass = driver.find_element_by_id("h-gender").get_attribute("class").split(" ")
            if len(sexSpanClass) == 3:
                sex = sexSpanClass[2]
            else:
                sex = "未填写"  # "not provided"

            # Extract the user level
            level = driver.find_element_by_css_selector(
                "#space-body > div.h > div.wrapper > div.h-inner > div.h-user > div > div.h-basic > div:nth-child(1) > a.h-level.m-level").get_attribute(
                "lvl")
            uid = urlNumber

            # Extract the registration time
            regtime = driver.find_element_by_class_name("regtime").find_element_by_class_name("text")
            regtime_text = regtime.text
            if (regtime_text == ''):
                regtime_text = "未填写"

            # Extract the birthday
            birthday = driver.find_element_by_class_name("birthday").find_element_by_class_name("text")
            birthday_text = birthday.text
            if (birthday_text == ''):
                birthday_text = "未填写"

            # Extract the location
            geo = driver.find_element_by_class_name("geo").find_element_by_class_name("text")
            geo_text = geo.text
            if (geo.text == ''):
                geo_text = "未填写"

            # Extract the follower count; "万" means "ten thousand"
            fan_num = driver.find_element_by_id("n-fs")
            fan_num_text = fan_num.text
            if (fan_num_text != '' and fan_num_text[-1] == "万"):
                fan_num_text = float(fan_num_text[:-1]) * 10000

            # Treat the presence of the name span as proof that the page loaded successfully
            if (span != None):
                nickname = span.text
                print(nickname, sex, level, uid, regtime_text[3:].strip(), birthday_text, geo_text, fan_num_text)
                row = [nickname, sex, level, uid, regtime_text[3:].strip(), birthday_text, geo_text, fan_num_text]
                csv_writer.writerow(row)

            # print(bsObj.find(id="h-name").get_text())
            urlNumber += pace
            counter += 1
            fail_num = 0
            driver.get("about:blank")
        except (TimeoutException, NoSuchElementException):
            urlNumber += pace
            fail_num += 1
            # Three consecutive failures usually mean we are being throttled,
            # or that a large block of uids does not exist
            if (fail_num > 2):
                driver.get("https://www.bilibili.com/")
                print('sleep 30s')
                # Back off for a while before crawling again
                time.sleep(30)
                # Reset the consecutive-failure counter
                fail_num = 0
                # Skip ahead past the (probably nonexistent) uid range
                urlNumber += 250000
            # PhantomJS seems to have a bug where page data gets mixed up across processes;
            # loading a blank page resets the state
            driver.get("about:blank")

    out.close()
    driver.close()

    end = time.time()
    print('Task %s runs %0.2f seconds.' % (num, (end - start)))

# Main program
if __name__ == '__main__':
    # Record the start time
    start_time = time.time()

    # uid to start crawling from
    crawler_start = 0
    # Number of processes
    crawler_num = 1
    # Number of valid records each process should collect
    crawler_sum = 10000
    # Step between crawled uids
    crawler_pace = 2500
    print('Parent process %s.' % os.getpid())
    # Process pool
    p = Pool(crawler_num)
    # Launch the worker processes
    for i in range(crawler_num):
        p.apply_async(scrawler_process, args=(crawler_start, i, crawler_sum, crawler_pace,))
    print('Waiting for all subprocesses done...')
    p.close()
    p.join()
    print('All subprocesses done.')

    end_time = time.time()
    print(end_time - start_time)
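
Note that PhantomJS support was removed in Selenium 4, so the script above needs an older Selenium release (3.x) plus a PhantomJS binary at the path set in phantom_path. As a minimal sketch of an alternative, assuming Selenium 4 with Chrome and chromedriver installed, only the driver setup would change; the options shown here are illustrative, and the rest of the crawler stays the same apart from switching the deprecated find_element_by_* helpers to find_element(By.*, ...):

# A minimal sketch, assuming Selenium 4.x with Chrome and chromedriver available.
# Only the driver setup changes; the crawling loop above is unchanged otherwise.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")     # run Chrome without a visible window
options.add_argument("--disable-gpu")
options.add_argument(
    "user-agent=Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36"
)
driver = webdriver.Chrome(options=options)

# Selenium 4 element lookups, e.g.:
# span = driver.find_element(By.ID, "h-name")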