Pitfall Notes
I stumbled on this personal blog by accident. A ping test suggests the server is hosted overseas, so transfer speeds are not great. To ease the load on the blogger's server, the URLs in this post are redacted; if you want the actual address, you can check the program or the site yourself.
Following the blog's categories, the scrape was split into four batches. The largest category alone has 2,886 photos, and it was also the one where I hit the most pitfalls.
In total I scraped 4,513 images, which took 117 min 37 sec.
Below I use that largest category as the running example.
The process breaks down into four steps:
- Collect the link of each blog post
- Collect the URL of every image in each post
- Fetch each image's binary data
- Save each image to disk
1. Collect the link of each blog post
post_href = []  # blog post links

def get_href():
    for i in range(1, 6):  # 5 pages in total
        if i == 1:
            link = url
        else:
            link = url + '/page/' + str(i)
        r = requests.get(link, headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        href_list = soup.find_all('h2', class_='entry-title')
        for each in href_list:
            href = each.find('a')['href']
            post_href.append(href)
2. Collect the URL of every image in each post
img_url = []  # image URLs (one list per post)

def get_imgURL():
    for href in post_href:
        r = requests.get(href, headers=headers)
        text = r.text
        reg = r'src="(.+?\.jpg)" alt'
        imgre = re.compile(reg)
        imgURL = re.findall(imgre, text)  # returns a list of matches
        img_url.append(imgURL)  # each element is itself a list
Here a regular expression collects every matching image URL from the page source.
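As a quick check of what that pattern captures, here is a minimal, self-contained sketch; the HTML snippet is invented for illustration and only mimics the shape of the blog's <img> tags:

import re

# Invented snippet mimicking the blog's <img ... src="...jpg" alt=...> markup
sample = '<img src="https://example.com/photos/001.jpg" alt="photo">'
print(re.findall(r'src="(.+?\.jpg)" alt', sample))
# -> ['https://example.com/photos/001.jpg']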
3. Fetch each image's binary data and save it
def get_img():
    number = 1
    for url_list in img_url:
        for url in url_list:
            headers['User-Agent'] = random.choice(user_agent_list)
            r = requests.get(url, headers=headers, verify=False)
            content = r.content
            file_name = dirname + '/' + str(number) + '.jpg'
            with open(file_name, 'wb') as file:
                file.write(content)
            print('Saved image %d' % number)
            number += 1
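A side note: r.content reads the whole image into memory at once. For larger files, a streamed variant of the same step would look roughly like this (my own sketch, not part of the original script):

def save_image_streamed(url, file_name):
    # Hypothetical variant: download in chunks instead of one r.content read
    with requests.get(url, headers=headers, stream=True, verify=False) as r:
        with open(file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)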
Pitfalls
1.
HTTPSConnectionPool(host='***', port=443)
Fix: add the verify=False parameter:
response = requests.get(url, verify=False)
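Note that verify=False turns off TLS certificate verification, and requests then emits an InsecureRequestWarning on every call. If those warnings clutter the output, they can be silenced (optional, not in the original script):

import urllib3

# Silence the InsecureRequestWarning triggered by verify=False
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)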
2.
requests.exceptions.ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',))
Fix: rotate the User-Agent at random:
user_agent_list = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
]

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.62 Safari/537.36 Edg/81.0.416.31',
    'Referer': '***',
}
headers['User-Agent'] = random.choice(user_agent_list)
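Rotating the User-Agent helps, but the remote end can still drop a connection now and then. A small retry wrapper along these lines is one option (my own addition; the retry count and timeout are arbitrary):

def get_with_retry(target_url, retries=3):
    # Hypothetical helper: retry with a fresh random User-Agent
    # whenever the server closes the connection without a response
    for attempt in range(retries):
        headers['User-Agent'] = random.choice(user_agent_list)
        try:
            return requests.get(target_url, headers=headers, verify=False, timeout=10)
        except requests.exceptions.ConnectionError:
            if attempt == retries - 1:
                raise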
Full code
"""
Author:JMbaozi
Total:2886
Take:48min 08sec
"""
import requests
import re
import lxml
import random
from bs4 import BeautifulSoup
user_agent_list = ["Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
"Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
]
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.62 Safari/537.36 Edg/81.0.416.31',
'Referer': '***'
}
url = '***'
post_href = []#博客链接
img_url = []#图片url
dirname = 'heisiPicture'
#获取链接
def get_href():
for i in range(1,6):#共5页
if i==1:
link = url
else:
link = url + '/page/' + str(i)
r = requests.get(link,headers=headers)
soup = BeautifulSoup(r.text,'lxml')
href_list = soup.find_all('h2',class_='entry-title')
for each in href_list:
href = each.find('a')['href']
post_href.append(href)
#获取照片url
def get_imgURL():
for href in post_href:
r = requests.get(href,headers=headers)
test = r.text
reg = r'src="(.+?\.jpg)" alt'
imgre = re.compile(reg)
imgURL = re.findall(imgre,test)#返回的是列表格式
img_url.append(imgURL)#元素为列表
#获取照片二进制信息并保存
def get_img():
number = 1
for url_list in img_url:
for url in url_list:
headers['User-Agent'] = random.choice(user_agent_list)
r = requests.get(url,headers=headers,verify=False)
content = r.content
file_name = dirname + '/' + str(number) + '.jpg'
with open(file_name,'wb') as file:
file.write(content)
print('保存第%d张'% (number))
number+=1
if __name__ == '__main__':
get_href()
get_imgURL()
get_img()
print('保存完成!')
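Finally, since the whole point of redacting the URL was to go easy on the blogger's server, a short pause between downloads is a cheap courtesy. A possible wrapper (my own suggestion; the 0.5 s delay is arbitrary):

import time
import requests

def polite_get(target_url, delay=0.5, **kwargs):
    # Hypothetical wrapper: sleep briefly before each request so the
    # scrape does not hammer the (already slow, overseas) server
    time.sleep(delay)
    return requests.get(target_url, **kwargs)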