您好,登錄後才能下訂單哦!
這裡使用 Node.js 下的 puppeteer-har 庫(底層依賴 chrome-har)來導出瀏覽器的 HAR 數據,經驗證效果不錯,比較靠譜。以下程式碼依序為三個檔案:日誌模組 log.js、抓取模組 puppeteerhar.js(兩者位於 module 目錄下),以及入口腳本。
//cnpm install --save log4js
const log4js = require('log4js');
/*
 * log4js configuration for the crawler.
 *
 * Appenders: one console appender plus three date-rolling file appenders,
 * each writing to its own directory under logs/.
 *
 * Categories: log4js routes a logger to the category whose name matches the
 * one passed to getLogger() and falls back to "default" when there is no
 * match. Each named logger therefore gets its own category so that its
 * messages land only in its own file (plus the console). Without these
 * entries every logger used "default" and each message was duplicated into
 * all three files. "default" is kept unchanged for any other logger name.
 */
const options = {
  appenders: {
    console: {
      type: "console"
    },
    "puppeteer-record": {
      type: 'dateFile',
      filename: 'logs/puppeteer/log',
      pattern: '-yyyy-MM-dd.log',
      alwaysIncludePattern: true,
      encoding: 'utf-8'
    },
    "puppeteer-har-record": {
      type: 'dateFile',
      filename: 'logs/puppeteerhar/log',
      pattern: '-yyyy-MM-dd.log',
      alwaysIncludePattern: true,
      encoding: 'utf-8'
    },
    "puppeteer-harevent-record": {
      type: 'dateFile',
      filename: 'logs/puppeteerharevent/log',
      pattern: '-yyyy-MM-dd.log',
      alwaysIncludePattern: true,
      encoding: 'utf-8'
    }
  },
  categories: {
    "default": { appenders: ['console', "puppeteer-record", "puppeteer-har-record", "puppeteer-harevent-record"], level: "all" },
    "console": { appenders: ['console'], level: "all" },
    "puppeteer-record": { appenders: ['console', "puppeteer-record"], level: "all" },
    "puppeteer-har-record": { appenders: ['console', "puppeteer-har-record"], level: "all" },
    "puppeteer-harevent-record": { appenders: ['console', "puppeteer-harevent-record"], level: "all" }
  }
};
log4js.configure(options);
/**
 * Fetch the log4js logger registered under the name "console".
 *
 * @returns {object} Whatever `log4js.getLogger('console')` returns.
 */
function getConsoleLogger() {
  return log4js.getLogger('console');
}
/**
 * Fetch the log4js logger registered under the name "puppeteer-record".
 *
 * @returns {object} Whatever `log4js.getLogger('puppeteer-record')` returns.
 */
function getPuppeteerRecordLogger() {
  return log4js.getLogger('puppeteer-record');
}
/**
 * Fetch the log4js logger registered under the name "puppeteer-har-record".
 *
 * @returns {object} Whatever `log4js.getLogger('puppeteer-har-record')` returns.
 */
function getPuppeteerHarRecordLogger() {
  return log4js.getLogger('puppeteer-har-record');
}
/**
 * Fetch the log4js logger registered under the name "puppeteer-harevent-record".
 *
 * @returns {object} Whatever `log4js.getLogger('puppeteer-harevent-record')` returns.
 */
function getPuppeteerHarEventRecordLogger() {
  return log4js.getLogger('puppeteer-harevent-record');
}
// Public API of the logging module: one accessor per logger name.
Object.assign(exports, {
  getConsoleLogger,
  getPuppeteerRecordLogger,
  getPuppeteerHarRecordLogger,
  getPuppeteerHarEventRecordLogger,
});
const puppeteer = require('puppeteer');
const PuppeteerHar = require('puppeteer-har');
const path = require("path");
const logger=require("./log");
const grpcclient=require("./grpcclient");
const log = logger.getPuppeteerHarRecordLogger() ;
/**
 * Launch a browser through Puppeteer with the crawler's fixed settings.
 *
 * @returns {Promise<object>} Resolves with the value `puppeteer.launch` resolves with.
 */
async function launchBrowser() {
  // Chromium command-line switches, see
  // https://peter.sh/experiments/chromium-command-line-switches/
  // NOTE(review): '--enable-feaures' looks like a misspelling of
  // '--enable-features'; it is kept verbatim so the launch arguments are unchanged.
  const chromiumArgs = [
    '--disk-cache-size=0',
    '--disable-cache',
    '--disable-infobars',
    '--window-size=800,600',
    '--ignore-certificate-errors',
    '--enable-feaures',
  ];

  const launchOptions = {
    // A manually downloaded Chromium needs an explicit `executablePath`; by
    // default Puppeteer looks under <project>/node_modules/puppeteer/.local-chromium/.
    // Ignore HTTPS errors when visiting https pages.
    ignoreHTTPSErrors: true,
    // Headless mode: no browser window is opened.
    headless: true,
    args: chromiumArgs,
    // Whether to auto-open a DevTools panel per tab; when true, headless is forced to false.
    devtools: false,
    // Launch timeout; defaults to 30000 (30 seconds), 0 disables it.
    timeout: 0,
    // For debugging, `slowMo: 250` can be added to slow down Puppeteer's actions.
  };

  return puppeteer.launch(launchOptions);
}
/**
 * Open a URL in a freshly launched browser and record the page's network
 * traffic into a HAR file via puppeteer-har.
 *
 * Errors raised while recording or navigating are logged and swallowed so a
 * single bad site does not abort a batch; errors from launching the browser,
 * preparing the page, or stopping the recorder still propagate to the caller.
 * The browser is closed in every case.
 *
 * @param {string} url - Address to visit; "http://" is prepended when it has no http(s) scheme.
 * @param {string} dirPath - Directory the HAR file is written into.
 * @param {string} filename - Name of the HAR file inside `dirPath`.
 * @returns {Promise<void>}
 */
async function saveHarlog(url, dirPath, filename) {
  // Full path of the HAR file to write.
  const harFilePath = path.join(dirPath, filename);

  // Normalise the address without reassigning the caller-visible parameter.
  const hasScheme = url.startsWith('http://') || url.startsWith('https://');
  const targetUrl = hasScheme ? url : "http://" + url;

  const browser = await launchBrowser();
  try {
    // Use the first page the browser already has instead of calling newPage().
    // Puppeteer's initial viewport is 800x600; page.setViewport() can change it.
    const page = (await browser.pages())[0];
    // Wait one second before recording starts.
    // NOTE(review): page.waitFor is deprecated in newer Puppeteer releases;
    // confirm the installed version still provides it.
    await page.waitFor(1000);

    const har = new PuppeteerHar(page);
    try {
      await har.start({ path: harFilePath });
      // goto() is the equivalent of typing the address and pressing Enter;
      // timeout 0 disables the navigation timeout.
      await page.goto(targetUrl, {
        timeout: 0
      });
      // title() returns a Promise, so it must be awaited before logging.
      log.info(await page.mainFrame().title());
      log.info(page.mainFrame().url());
      // Hook point (disabled): the Java-side HAR parser can be notified here, e.g.
      // grpcclient.resovleHarLog({ url, file_name: filename, file_dir: dirPath, context: '' }).
    } catch (error) {
      log.error('resovle error :' + targetUrl + "; error message:" + error);
    } finally {
      // Always stop the recorder, even after a failed navigation.
      await har.stop();
    }
  } finally {
    // Close the browser even when page setup or har.stop() throws.
    await browser.close();
  }
}
// Public API of the HAR capture module.
Object.assign(exports, {
  launchBrowser,
  saveHarlog,
});
const fs = require("fs");
const path = require("path");
const moment = require("moment");
const schedule = require('node-schedule');
const cvsresovler=require("./module/cvsresovle");
const mhar=require("./module/puppeteerhar");
/*
cnpm install --save moment
cnpm install --save csv
cnpm install --save node-schedule
cnpm install --save puppeteer
cnpm install --save puppeteer-har
cnpm install --save iconv-lite
cnpm install --save chrome-har
cnpm install --save grpc
*/
/**
 * Register the scheduled HAR capture job with node-schedule.
 *
 * Each run creates harlogs/<YYYYMMDDHHmm>/ next to this script, reads the URL
 * list from top300.csv via cvsresovler.readUrlRecord, and captures the sites
 * strictly one after another; each capture is awaited before the next starts.
 * Rows without a usable SITE_LINK value are skipped.
 *
 * @returns {void}
 */
function init() {
  console.log('初始化調度器');
  // Six-field cron expression (second minute hour day month weekday):
  // fires every day at 10:14:00.
  schedule.scheduleJob('0 14 10 * * *', () => {
    const ftime = moment().format('YYYYMMDDHHmm');
    console.log('當前調度時間為:' + ftime);
    const dirPath = path.join(__dirname, 'harlogs', ftime);
    console.log("創建目錄:" + dirPath);

    const isExist = fs.existsSync(dirPath) && fs.lstatSync(dirPath).isDirectory();
    if (!isExist) {
      console.log("創建文件夾" + ftime);
      // recursive: also creates the parent "harlogs" directory on the first run.
      fs.mkdirSync(dirPath, { recursive: true });
    }

    // Load the list of URLs to process.
    const dataArr = cvsresovler.readUrlRecord(path.join(__dirname, 'top300.csv'));
    console.log("解析出URL共計" + dataArr.length + "條");

    // Captures run sequentially. The promise is returned so the caller of the
    // job callback can observe completion.
    return (async () => {
      for (let i = 0; i < dataArr.length; i++) {
        const record = dataArr[i];
        const rawUrl = record ? record['SITE_LINK'] : undefined;
        // A missing or non-string link must not abort the rest of the batch.
        const url = typeof rawUrl === 'string' ? rawUrl.trim() : '';
        if (!url) {
          continue;
        }
        // Slashes and backslashes cannot appear in a file name.
        const filename = url.replace(/\//g, '-').replace(/\\/g, '-') + '.har';
        console.log((i + 1) + "-starting to resovle url :" + url);
        try {
          await mhar.saveHarlog(url, dirPath, "N" + "-" + filename);
        } catch (error) {
          console.log(error);
        }
      }
    })();
  });
  console.log('應用程序啟動完成');
}
//執行
//init();
/**
 * Manual test entry point: capture a single URL into harlogs/<YYYYMMDDHHmm>/.
 *
 * The URL is taken from the first command-line argument and defaults to
 * "www.baidu.com". Errors from the capture itself are logged, not rethrown.
 *
 * @returns {Promise<void>}
 */
async function test() {
  const ftime = moment().format('YYYYMMDDHHmm');
  console.log('當前執行時間為:' + ftime);
  const dirPath = path.join(__dirname, 'harlogs', ftime);
  console.log("創建目錄:" + dirPath);

  const isExist = fs.existsSync(dirPath) && fs.lstatSync(dirPath).isDirectory();
  if (!isExist) {
    console.log("創建文件夾" + ftime);
    // recursive: also creates the parent "harlogs" directory when it is missing.
    fs.mkdirSync(dirPath, { recursive: true });
  }

  // slice() leaves process.argv untouched. The local is not called `arguments`,
  // which would shadow the function's arguments object and is a SyntaxError in
  // strict mode.
  const cliArgs = process.argv.slice(2);
  const url = (cliArgs.length > 0 ? cliArgs[0] : "www.baidu.com").trim();
  if (!url) {
    return;
  }

  // Slashes and backslashes cannot appear in a file name.
  const filename = url.replace(/\//g, '-').replace(/\\/g, '-') + '.har';
  console.log("starting to resovle test url :" + url);
  try {
    await mhar.saveHarlog(url, dirPath, "NT" + "-" + filename);
  } catch (error) {
    console.log(error);
  }
}
// Run the manual test. Any error that escapes test() is reported here and
// marks the process as failed instead of leaving the promise unhandled.
test().catch((error) => {
  console.log(error);
  process.exitCode = 1;
});
關於 gRPC 部分的代碼,請參考我另外一篇博文
參考地址:https://michaljanaszek.com/blog/generate-har-with-puppeteer
免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯繫站長郵箱:is@yisu.com進行舉報,並提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。