您好,登錄后才能下訂單哦!
視頻課程鏈接:http://edu.51cto.com/course/14870.html
• 爬蟲,稱為網頁蜘蛛或網絡機器人,用于自動獲(爬)取互聯網上的信息,本質上就是一段代碼
• 任何一門高級開發語言都可以實現爬蟲,并不只有Python
• 通過代碼,模擬瀏覽器向服務器發送HTTP或HTTPS請求,然后對服務器響應的結果進行處理,從中獲取想要的數據
• 三步走:獲取數據、處理數據、存儲數據
• 第一步,獲取數據:使用urllib模塊模擬瀏覽器發送請求
# Step 1: fetch data
def get_data():
    """Download the 51job Java-developer search-result page and save it as index.html.

    The site serves GBK-encoded HTML, so both the decode and the file write
    use the gbk codec. No return value; the page is written to disk for the
    parsing step.
    """
    url = 'https://search.51job.com/list/070200,000000,0000,00,9,99,java%25E5%25BC%2580%25E5%258F%2591,2,1.html'
    # Spoof a regular desktop-Chrome User-Agent so the server returns the normal page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    req = request.Request(url, headers=headers)
    # Context manager closes the HTTP response even on error
    # (the original left the connection open).
    with request.urlopen(req) as response:
        if response.getcode() == 200:
            data = str(response.read(), encoding='gbk')  # bytes -> str, site is GBK
            # Persist the raw page so parsing can be re-run without re-fetching.
            with open('index.html', mode='w', encoding='gbk') as f:
                f.write(data)
• 處理(解析)數據的三種方式:
  1. 字符串解析:使用字符串+正則表達式
  2. 使用XPath
     XPath是一門在XML文檔中查找信息的語言,用來在XML文檔中對元素和屬性進行遍歷。
     可使用Chrome瀏覽器的開發人員工具獲取XPath
  3. 使用第三方模塊BeautifulSoup
     Beautiful Soup是一個可以從HTML或XML文件中提取數據的Python庫
     安裝:pip install beautifulsoup4
# Step 2: parse data
def parse_data():
    """Parse the saved index.html and return job postings as a list of dicts.

    Each dict has the keys: title, company, addr, salary, pubDate.
    Rows missing any of the five cells are skipped instead of raising
    IndexError (the original crashed via select(...)[0] on malformed rows).
    BeautifulSoup also offers find()/find_all(); CSS select() is used here.
    """
    with open('index.html', mode='r', encoding='gbk') as f:
        html = f.read()
    # 'html.parser' is the stdlib parser, so no extra dependency (e.g. lxml) is needed.
    bs = BeautifulSoup(html, 'html.parser')

    def _cell(node, selector):
        """Return the stripped text of the first match under node, or None when absent."""
        hits = node.select(selector)
        return hits[0].get_text(strip=True) if hits else None

    result = []
    # The first '.el' row inside #resultList is the table header -> skip it.
    for div in bs.select('#resultList .el')[1:]:
        row = {
            'title': _cell(div, '.t1'),
            'company': _cell(div, '.t2'),
            'addr': _cell(div, '.t3'),
            'salary': _cell(div, '.t4'),
            'pubDate': _cell(div, '.t5'),
        }
        if None in row.values():
            continue  # incomplete/advert row — ignore rather than crash
        result.append(row)
    return result
# Step 3 (option A): store data in MySQL
def save_to_mysql(data):
    """Bulk-insert job rows into table t_job.

    data: iterable of dicts with keys title/company/addr/salary/pubDate
    (the shape produced by parse_data). The SQL is parameterized, so values
    are escaped by the driver. try/finally guarantees the connection is
    closed even when the insert fails (the original leaked it on error),
    and a failed batch is rolled back instead of leaving a half-open
    transaction.
    """
    config = {
        'host': 'localhost',
        'port': 3306,
        'user': 'root',
        'password': '',
        'database': 'python',
        'charset': 'utf8'
    }
    conn = pymysql.connect(**config)
    try:
        # pymysql cursors are context managers: closed automatically.
        with conn.cursor() as cursor:
            sql = '''
            insert into t_job
            (title, company, addr, salary, pubDate)
            values
            (%(title)s,%(company)s,%(addr)s,%(salary)s,%(pubDate)s)
            '''
            cursor.executemany(sql, data)
        conn.commit()
    except Exception:
        conn.rollback()  # undo the partial batch before propagating
        raise
    finally:
        conn.close()
• 使用openpyxl模塊操作Excel
• 安裝openpyxl:pip install openpyxl
• 三個核心概念:
  工作薄Workbook
  工作表Sheet
  單元格Cell
# Step 3 (option B): store data in an Excel workbook
def save_to_excel(data):
    """Write job rows to 51job.xlsx: one header row plus one row per posting.

    data: iterable of dicts with keys title/company/addr/salary/pubDate.
    Renames the workbook's default active sheet instead of calling
    create_sheet(), which in the original left an extra empty 'Sheet'
    in the saved file.
    """
    book = Workbook()
    sheet = book.active          # reuse the default sheet rather than adding one
    sheet.title = '南京Java招聘信息'
    # Header row first, then one appended row per job record.
    sheet.append(['職位名', '公司名', '工作地點', '薪資', '發布時間'])
    for item in data:
        sheet.append([item['title'], item['company'], item['addr'],
                      item['salary'], item['pubDate']])
    book.save('51job.xlsx')
• 安裝redis庫:pip install redis
# Step 3 (option C): store data in Redis
def save_to_redis(data):
    """Push job rows onto the Redis list 'jobs'.

    Each dict is serialized to JSON first: redis-py only accepts
    bytes/str/int/float values and raises DataError for a raw dict,
    so the original lpush(dict) call could never succeed.
    """
    import json  # local import: this script section has no json import at top level
    config = {
        'host': '192.168.2.30',
        'port': 6379,
        'charset': 'utf8'
    }
    r = redis.Redis(**config)
    for item in data:
        # ensure_ascii=False keeps the Chinese text readable in Redis.
        r.lpush('jobs', json.dumps(item, ensure_ascii=False))
# Read stored data back out of Redis
def read_from_redis():
    """Print every entry currently stored on the Redis list 'jobs'."""
    # decode_responses=True makes the client return str instead of raw bytes.
    conn = redis.Redis(host='192.168.2.30',
                       port=6379,
                       charset='utf8',
                       decode_responses=True)
    print(conn.lrange('jobs', 0, -1))
from urllib import request
import json
def get_data():
    """Fetch Douban's hot-movie JSON feed.

    Returns the raw response bytes on HTTP 200, otherwise None.
    """
    url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=400&page_start=0'
    # A browser User-Agent is required; the API rejects the default urllib one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    req = request.Request(url, headers=headers)
    # Context manager closes the response even on error (the original leaked it).
    with request.urlopen(req) as response:
        if response.getcode() == 200:
            return response.read()
    return None
def parse_data(html):
    """Parse Douban's hot-movie JSON and return a list of (title, rate) tuples.

    Also prints each pair (kept from the original for console use).
    Returns [] when html is falsy — e.g. when get_data() returned None
    on a failed request (the original crashed on json.loads(None)).
    """
    if not html:
        return []
    data = json.loads(html)  # accepts both bytes and str JSON
    result = []
    for movie in data['subjects']:
        print(movie['title'], movie['rate'])
        result.append((movie['title'], movie['rate']))
    return result
# Entry point: fetch the Douban feed and print each title/rating pair.
if __name__ == '__main__':
    parse_data(get_data())
• 步驟:獲取數據 → 處理數據 → 存儲數據
from urllib import request
import json
from datetime import datetime, timedelta
import time
# Fetch data
def get_data(url):
    """GET *url* pretending to be mobile Safari (Maoyan's mobile API).

    Returns the raw response bytes on HTTP 200, otherwise None.
    """
    # Mobile User-Agent: the m.maoyan.com endpoint serves the mobile client.
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
    }
    req = request.Request(url, headers=headers)
    # Context manager closes the response even on error (the original leaked it).
    with request.urlopen(req) as response:
        if response.getcode() == 200:
            return response.read()
    return None
# Parse data
def parse_data(html):
    """Extract comment records from Maoyan's JSON payload.

    Returns a list of dicts with keys id/nickName/cityName/content/score/
    startTime. Returns [] when html is falsy (failed request). Newlines in
    the comment text are flattened to spaces so each record later occupies
    a single line in the output file.
    """
    if not html:
        return []
    comments = []
    for item in json.loads(html)['cmts']:
        comments.append({
            'id': item['id'],
            'nickName': item['nickName'],
            'cityName': item.get('cityName', ''),  # some comments carry no city
            'content': item['content'].replace('\n', ' '),  # keep record single-line
            'score': item['score'],
            'startTime': item['startTime'],
        })
    return comments
# Store data in a text file
def save_to_txt():
    """Crawl Maoyan comments for movie 1203084 backwards in time into comments.txt.

    Each request fetches comments older than start_time; after a page is
    saved, start_time moves to 1 second before the oldest comment on that
    page, until it passes end_time. Lines are appended comma-separated.
    """
    start_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # crawl from "now"
    end_time = '2018-08-10 00:00:00'  # stop at the movie's release date
    # Plain string comparison is safe here: the zero-padded format sorts chronologically.
    # Open the output once instead of reopening it for every single record.
    with open('comments.txt', mode='a', encoding='utf-8') as f:
        while start_time > end_time:
            url = ('http://m.maoyan.com/mmdb/comments/movie/1203084.json'
                   '?_v_=yes&offset=0&startTime=' + start_time.replace(' ', '%20'))
            try:
                html = get_data(url)
            except Exception:  # narrowed from bare except: transient error -> wait, retry once
                time.sleep(1)
                html = get_data(url)
            else:
                time.sleep(0.1)  # throttle between successful requests
            if not html:
                break  # non-200 response: stop instead of crashing in parse_data
            comments = parse_data(html)
            if not comments:
                break  # empty page: previously this raised IndexError below
            print(comments)
            # Step to 1s before the oldest (last) comment so pages don't overlap.
            # The original hard-coded index 14 and crashed on pages shorter than 15.
            start_time = comments[-1]['startTime']
            start_time = datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S') - timedelta(seconds=1)
            start_time = datetime.strftime(start_time, '%Y-%m-%d %H:%M:%S')
            for item in comments:
                f.write(str(item['id']) + ',' + item['nickName'] + ',' +
                        item['cityName'] + ',' + item['content'] + ',' +
                        str(item['score']) + ',' + item['startTime'] + '\n')
# Entry point: crawl the full comment history into comments.txt.
if __name__ == '__main__':
    # Single-page fetch kept for reference/debugging:
    # url = 'http://m.maoyan.com/mmdb/comments/movie/1203084.json?_v_=yes&offset=15&startTime=2018-09-01%2011%3A10%3A00'
    # comments = parse_data(get_data(url))
    # print(comments)
    save_to_txt()
• 使用pyecharts類庫進行數據可視化
from collections import Counter
from pyecharts import Geo
import json
from pyecharts import Bar
def render():
    """Build the fan-location Geo map and the top-20 city bar chart.

    Reads comments.txt, extracts the city column, reconciles city names
    against pyecharts' coordinate file via handle(), then renders two
    standalone HTML charts. Uses the pyecharts 0.x API (Geo/Bar classes).
    """
    # Collect every non-empty city (comma-separated column index 2).
    cities = []
    with open('comments.txt', mode='r', encoding='utf-8') as f:
        rows = f.readlines()
    for row in rows:
        city = row.split(',')[2]
        if city != '':
            cities.append(city)
    # Fix up city names so the coordinate lookup succeeds (mutates cities in place).
    handle(cities)
    # Count each city once and reuse the Counter for both charts
    # (the original built an identical Counter twice).
    counter = Counter(cities)
    data = counter.most_common()  # e.g. [('南京', 25), ('北京', 59), ...]
    # Geo scatter map of where commenters are located.
    geo = Geo(
        "《一出好戲》粉絲位置分布",
        "數據來源:貓眼",
        title_color="#fff",
        title_pos="center",
        width=1200,
        height=600,
        background_color="#404a59",
    )
    attr, value = geo.cast(data)
    geo.add(
        "",
        attr,
        value,
        visual_range=[0, 3500],
        visual_text_color="#fff",
        symbol_size=15,
        is_visualmap=True,
    )
    geo.render('粉絲位置分布.html')
    # Bar chart of the 20 most frequent cities.
    cities_top20 = counter.most_common(20)
    bar = Bar("《一出好戲》粉絲來源排行榜TOP20", '數據來源:貓眼', title_pos='center', width=1200, height=600)
    attr, value = bar.cast(cities_top20)
    bar.add("", attr, value)
    bar.render('粉絲來源排行榜-柱狀圖.html')
# Default location of pyecharts' bundled coordinate file (absolute path from
# the original environment; pass coords_path when yours differs).
_CITY_COORDS_PATH = 'C:/Users/User/PycharmProjects/python-spider/venv/Lib/site-packages/pyecharts/datasets/city_coordinates.json'


def handle(cities, coords_path=_CITY_COORDS_PATH):
    """Reconcile city names in *cities* with pyecharts' coordinate file.

    For each distinct city: an exact key match is kept as-is; an abbreviated
    name (e.g. 南京 for 南京市) or a renamed district (e.g. 溧水縣 -> 溧水區)
    gets an alias entry added to the coordinate data; a city with no match at
    all is removed from *cities* in place so Geo rendering cannot fail.
    The augmented dict is written back to coords_path.

    coords_path is a new keyword parameter defaulting to the original
    hard-coded path (previously duplicated in two places).
    """
    with open(coords_path, mode='r', encoding='utf-8') as f:
        data = json.loads(f.read())  # str -> dict of {name: [lng, lat]}
    data_new = data.copy()  # work on a copy; aliases are added here
    for city in set(cities):
        for k in data:
            if k == city:
                break  # exact match: nothing to do
            if k.startswith(city):  # abbreviated name, e.g. 南京 for 南京市
                data_new[city] = data[k]
                break
            if len(city) >= 3 and k.startswith(city[0:-1]):  # renamed district
                data_new[city] = data[k]
                break
        else:
            # No match anywhere in the file: drop every occurrence.
            # (The original compared a manual counter against len(data),
            # which wrongly removed a city that matched the *last* key;
            # for/else only runs when the loop truly exhausted the dict.)
            while city in cities:
                cities.remove(city)
    # Write the augmented coordinate file back in place.
    with open(coords_path, mode='w', encoding='utf-8') as f:
        f.write(json.dumps(data_new, ensure_ascii=False))  # keep Chinese readable
# Entry point: build the geo map and bar chart from comments.txt.
if __name__ == '__main__':
    render()
from pyecharts import Pie

# Pull the score column (index 4) out of every saved comment line.
# NOTE(review): split(',') assumes the comment content holds no commas —
# verify against the writer that produced comments.txt.
with open('comments.txt', mode='r', encoding='utf-8') as f:
    rates = [line.split(',')[4] for line in f.readlines()]

# Maoyan scores come in 0.5 steps; two adjacent steps map to one star level.
star_levels = [('5', '4.5'), ('4', '3.5'), ('3', '2.5'), ('2', '1.5'), ('1', '0.5')]
attr = ['五星', '四星', '三星', '二星', '一星']
value = [rates.count(hi) + rates.count(lo) for hi, lo in star_levels]

# Render the star-rating distribution as a pie chart.
pie = Pie("《一出好戲》評分星級", title_pos='center', width=900)
pie.add("", attr, value, is_label_show=True, is_legend_show=False)
pie.render('電影評分-餅圖.html')
• jieba(結巴)是一個強大的分詞庫,完美支持中文分詞
• Matplotlib是一個Python的2D繪圖庫,可以生成繪圖、直方圖、功率譜、條形圖、錯誤圖、散點圖等
• wordcloud是基于Python的詞云生成類庫,簡單易用,而且功能強大
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
# Collect the comment-text column (index 3) from every saved row.
# NOTE(review): split(',') assumes comment content holds no commas — confirm
# against the writer that produced comments.txt.
comments = []
with open('comments.txt', mode='r', encoding='utf-8') as f:
    for line in f.readlines():
        text = line.split(',')[3]
        if text != '':
            comments.append(text)

# Tokenise with jieba in precise mode and space-join for WordCloud.
words = ' '.join(jieba.cut(str(comments), cut_all=False))

# Words that should not appear in the cloud (movie title, filler words).
stopwords = STOPWORDS.copy()
for banned in ('電影', '一出', '好戲', '有點'):
    stopwords.add(banned)

# The mask image defines the cloud's outline.
bg_image = plt.imread('love.jpg')

# Build the cloud; font_path points at a Chinese-capable font.
wc = WordCloud(width=1024, height=768, background_color='white', mask=bg_image,
               stopwords=stopwords, max_font_size=400, random_state=50,
               font_path='STKAITI.TTF')
wc.generate_from_text(words)

# Display on screen, then persist to disk.
plt.imshow(wc)
plt.axis('off')  # hide the axes around the image
plt.show()
wc.to_file('詞云圖.jpg')
免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯系站長郵箱:is@yisu.com進行舉報,并提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。