A Python crawler (web scraper) fetches the useful data from a specified URL. With code you can collect data in bulk, and in principle any data you can reach through a browser can also be fetched by a crawler.
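As a minimal sketch of the idea (the URL and User-Agent below are placeholders, not part of the example that follows), fetching a page with requests looks like this:

import requests

# Fetch a page the same way a browser would; example.com is only a placeholder URL
response = requests.get('https://example.com', headers={'User-Agent': 'Mozilla/5.0'})
print(response.status_code)  # 200 means the server answered successfully
print(response.text[:200])   # first 200 characters of the returned HTML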
Example:
Crawl and download images. First, install the following dependencies:
pip install requests
pip install BeautifulSoup4
pip install lxml
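As a quick, optional check that the three packages installed correctly (a minimal sketch, not part of the crawler itself):

import requests
from bs4 import BeautifulSoup
import lxml  # imported only to verify the package is installed

# Parse a tiny HTML snippet with the lxml parser to confirm the toolchain works
soup = BeautifulSoup('<p>hello</p>', 'lxml')
print(soup.p.text)  # prints: hello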
Implementation:
import os
import re
from uuid import uuid1
import requests
from bs4 import BeautifulSoup
from random import choice
# Build request headers with a randomly chosen User-Agent from user_agent.txt
def get_headers():
    with open('user_agent.txt', 'r') as file:
        user_agent_list = file.readlines()
    user_agent = choice(user_agent_list).replace('\n', '')
    # Fall back to a fixed User-Agent if the chosen line is too short to be valid
    if len(user_agent) < 10:
        user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0'
    headers = {
        "User-Agent": user_agent,
    }
    return headers
# Download a single image and save it under images/ with a random file name
def download(src, end):
    try:
        headers = get_headers()
        response = requests.get(src, headers=headers)
        # response.content is the raw binary data of the image
        img = response.content
        # 'wb' opens the file for writing in binary mode
        path = "images/" + str(uuid1()) + end
        with open(path, 'wb') as f:
            f.write(img)
    except Exception:
        # Skip any image that fails to download
        pass
# Request one list page and download every image found on it
def requests_get(url):
    try:
        headers = get_headers()
        # Fetch the page
        response = requests.get(url, headers=headers)
        # Parse the HTML with the lxml parser
        soup = BeautifulSoup(response.text, 'lxml')
        image_list = soup.find_all(attrs={"class": "img-responsive"})
        for image in image_list[:-1]:
            # Image URL
            src = image.attrs["data-backup"]
            # File extension, e.g. ".jpg"
            end = os.path.splitext(src)[1]
            if src and end:
                # Strip punctuation that sometimes trails the extension
                end = re.sub(r'[,。??,/\\·]', '', end)
                # Hand off to the download function
                download(src, end)
    except Exception as e:
        print(e)
if __name__ == '__main__':
    # Make sure the output directory exists before downloading
    os.makedirs("images", exist_ok=True)
    # Walk through the first few list pages
    for page in range(1, 5):
        url = 'https://www.doutula.com/photo/list/?page=%d' % page
        requests_get(url)
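The script assumes a user_agent.txt file in the working directory with one User-Agent string per line; get_headers() picks a random line from it. The two lines below are only illustrative examples of what that file might contain:

Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15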