forked from github/lightcrawler
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Drew McMillan
committed
Sep 3, 2018
1 parent
669ccc9
commit 0e970df
Showing
11 changed files
with
2,483 additions
and
229 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,11 @@ | ||
{ | ||
"url": "https://www.example.com", | ||
"maxDepth": 1, | ||
"maxChromeInstances": 5, | ||
"limit": "/music/", | ||
"url": "https://www.bbc.co.uk/news", | ||
"maxDepth": 2, | ||
"maxChromeInstances": 10, | ||
"limit": "/news/", | ||
"httpsOnly": true, | ||
"showHttpLinksDuring": false, | ||
"showHttpLinksAfter": true, | ||
"userAgent": "light-mc-crawler Mixed Content Crawler" | ||
"chromeFlags": ["--show-paint-rects", "--no-sandbox", "--user-data-dir", "--headless", "--disable-setuid-sandbox", "--disable-gpu", "--disable-dev-shm-usage"], | ||
"userAgent": "light-mc-crawler - News Discovery - Mixed Content Crawler" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,244 +1,53 @@ | ||
const cheerio = require('cheerio') | ||
const ChildProcess = require('child_process') | ||
const Crawler = require('simplecrawler') | ||
const path = require('path') | ||
const queue = require('async/queue') | ||
const fs = require('fs') | ||
const colors = require('colors') | ||
const util = require('util') | ||
|
||
const stats = { | ||
const cheerio = require('cheerio'), | ||
queue = require('./src/queue'), | ||
path = require('path'), | ||
fs = require('fs'), | ||
Crawler = require('./src/crawler'), | ||
printStats = require('./src/log/final'); | ||
|
||
let config, stats = { | ||
pageCount: 0, | ||
violationCounts: {}, | ||
foundHttpLinks: {}, | ||
passedAuditsCount: 0, | ||
startTime: null, | ||
auditTimesByPageUrl: {} | ||
totalErrorCount: 0, | ||
startTime: new Date() | ||
}; | ||
|
||
/**
 * Extracts the links to crawl next from a fetched HTML document.
 *
 * Collects the `href` value of every `<a href>` element and, when the
 * module-level `config.limit` is set, keeps only the links that contain that
 * substring.
 *
 * @param {Buffer} buffer - Response body; decoded with `toString('utf8')`.
 * @param {Object} item - Queue item of the fetched page. Not read here; kept
 *   because the function is installed as `crawler.discoverResources`.
 * @returns {string[]} The href values that passed the optional filter.
 */
function discoverResources(buffer, item) {
  const page = cheerio.load(buffer.toString('utf8'));
  const links = page('a[href]')
    .map(function () {
      return page(this).attr('href');
    })
    .get();

  // `config` is only assigned once the exported entry point has run; without
  // it (or without a limit) every discovered link is kept.
  const limit = config && config.limit;
  if (!limit) {
    return links;
  }

  return links.filter((link) => link.includes(limit));
}
|
||
module.exports = (options) => { | ||
console.log("ô¿ô light-mc-crawler has started crawling. If it looks like nothing is happening, wait, it is :)"); | ||
|
||
stats.startTime = new Date() | ||
config = JSON.parse(fs.readFileSync(path.resolve(options.config))) | ||
|
||
const configPath = path.resolve(options.config) | ||
const config = JSON.parse(fs.readFileSync(configPath)) | ||
const lighthouseQueue = queue(config, stats); | ||
const crawler = Crawler(config); | ||
|
||
const crawler = new Crawler(options.url || config.url) | ||
crawler.respectRobotsTxt = false | ||
crawler.parseHTMLComments = false | ||
crawler.parseScriptTags = false | ||
crawler.userAgent = options.userAgent || "light-mc-crawler Mixed Content Crawler" | ||
crawler.maxDepth = config.maxDepth || 1 | ||
|
||
|
||
crawler.discoverResources = (buffer, item) => { | ||
const page = cheerio.load(buffer.toString('utf8')) | ||
var links = page('a[href]').map(function () { | ||
return page(this).attr('href') | ||
}).get() | ||
|
||
if(config.limit){ | ||
links = links.filter(function(s){ | ||
return ~s.indexOf(config.limit); | ||
}); | ||
} | ||
|
||
if(config.showHttpLinksDuring || config.showHttpLinksAfter){ | ||
links.forEach(function(link) { | ||
if(link.indexOf('http://') !== -1){ | ||
if(!stats.foundHttpLinks[item.url]){ | ||
stats.foundHttpLinks[item.url] = []; | ||
} | ||
|
||
stats.foundHttpLinks[item.url].push(link) | ||
} | ||
}); | ||
|
||
if(config.showHttpLinksDuring && stats.foundHttpLinks[item.url]){ | ||
console.log(); | ||
console.log('Http link(s) on '.bold.underline + item.url.bold.underline); | ||
stats.foundHttpLinks[item.url].forEach(function(link) { | ||
console.log(' ' + link); | ||
}); | ||
} | ||
} | ||
|
||
return links | ||
} | ||
|
||
let totalErrorCount = 0 | ||
|
||
const lighthouseQueue = queue((url, callback) => { | ||
runLighthouse(url, config, (errorCount) => { | ||
totalErrorCount += errorCount | ||
callback() | ||
}) | ||
}, config.maxChromeInstances || 5) | ||
crawler.discoverResources = discoverResources; | ||
|
||
crawler.on('fetchcomplete', (queueItem, responseBuffer, response) => { | ||
lighthouseQueue.push(queueItem.url) | ||
}) | ||
}); | ||
|
||
crawler.once('complete', () => { | ||
lighthouseQueue.drain = () => { | ||
printStats(config) | ||
if (totalErrorCount > 0) { | ||
printStats(stats) | ||
if (stats.totalErrorCount > 0) { | ||
process.exit(1) | ||
} | ||
} | ||
}) | ||
|
||
crawler.start() | ||
} | ||
|
||
/**
 * Audits a single URL by spawning the Lighthouse CLI with the mixed-content
 * configuration and reports the number of failing audits.
 *
 * Side effects: increments `stats.pageCount`, records start/end times in
 * `stats.auditTimesByPageUrl`, updates `stats.passedAuditsCount` and
 * `stats.violationCounts`, and prints every failure to the console.
 *
 * @param {string} url - Page to audit. `http://` is rewritten to `https://`
 *   when `config.httpsOnly` is truthy.
 * @param {Object} config - Crawler configuration. Reads `httpsOnly`,
 *   `chromeFlags` (space-delimited string or array of flags) and `userAgent`.
 * @param {function(number): void} callback - Called exactly once with the
 *   number of failing audits, or with 1 when Lighthouse could not be run or
 *   its output could not be parsed.
 */
function runLighthouse (url, config, callback) {
  const targetUrl = config.httpsOnly ? url.replace("http://", "https://") : url;

  stats.pageCount++;
  const mixedContent = require.resolve('lighthouse/lighthouse-core/config/mixed-content.js');
  const configuredFlags = config.chromeFlags || '--headless --disable-gpu';
  // The flags may be configured as an array; Lighthouse expects one
  // space-delimited string, and plain concatenation would join with commas.
  const chromeFlags = Array.isArray(configuredFlags) ? configuredFlags.join(' ') : configuredFlags;
  const userAgent = config.userAgent || 'light-mc-crawler Mixed Content Crawler';
  const args = [
    targetUrl,
    '--output=json',
    '--output-path=stdout',
    '--disable-device-emulation',
    '--disable-cpu-throttling',
    '--disable-storage-reset',
    '--disable-network-throttling',
    // A separating space keeps the last Chrome flag from fusing with
    // `--user-agent`; the quotes keep a user agent with spaces in one flag.
    `--chrome-flags=${chromeFlags} --user-agent="${userAgent}"`,
    `--config-path=${mixedContent}`
  ];

  const lighthousePath = require.resolve('lighthouse/lighthouse-cli/index.js');
  const lighthouse = ChildProcess.spawn(lighthousePath, args);

  let output = '';
  lighthouse.stdout.on('data', (data) => {
    output += data;
  });

  stats.auditTimesByPageUrl[targetUrl] = {startTime: new Date()};

  let settled = false;
  // Returns true for the first caller only and stamps the audit's end time,
  // so `callback` cannot fire twice when both 'error' and 'close' are emitted.
  const settle = () => {
    if (settled) {
      return false;
    }
    settled = true;
    stats.auditTimesByPageUrl[targetUrl].endTime = new Date();
    return true;
  };

  // Without a listener, a failed spawn raises an unhandled 'error' event and
  // takes the whole crawl down; count it as one failure instead.
  lighthouse.once('error', (spawnError) => {
    if (!settle()) {
      return;
    }
    console.log();
    console.error(`Lighthouse could not be run for ${targetUrl}: ${spawnError.message}`);
    callback(1);
  });

  lighthouse.once('close', () => {
    if (!settle()) {
      return;
    }

    let report;
    try {
      report = JSON.parse(output);
    } catch (parseError) {
      console.log();
      if (output !== '') {
        console.error(`Parsing JSON report output failed for ${targetUrl}: ${output}`);
        console.log(parseError);
      } else {
        console.error(`Lighthouse report returned nothing for ${targetUrl}`);
      }

      callback(1);
      return;
    }

    let errorCount = 0;
    report.reportCategories.forEach((category) => {
      let displayedCategory = false;
      category.audits.forEach((audit) => {
        if (audit.id !== "is-on-https") {
          //mixed-content is buggy atm, will work on fixing.
          //is-on-https seems to surface everything well enough
          return;
        }

        if (audit.score === 100) {
          stats.passedAuditsCount++;
          return;
        }

        if (!displayedCategory) {
          console.log();
          console.log(category.name.bold.underline + ` current page count: ${stats.pageCount}`);
          displayedCategory = true;
        }
        errorCount++;
        console.log(targetUrl.replace(/\/$/, ''), '\u2717'.red, audit.id.bold, '-', audit.result.description.italic);

        if (stats.violationCounts[category.name] === undefined) {
          stats.violationCounts[category.name] = 0;
        }

        logViolationDetails(audit, category.name);
      });
    });

    callback(errorCount);
  });
}

/**
 * Prints the offending resources of one failed audit and adds them to
 * `stats.violationCounts[categoryName]` (which must already be initialised).
 */
function logViolationDetails (audit, categoryName) {
  const {extendedInfo, details} = audit.result;

  if (extendedInfo) {
    const {value} = extendedInfo;
    if (Array.isArray(value)) {
      stats.violationCounts[categoryName] += value.length;
      value.forEach((result) => {
        if (result.url) {
          console.log(` ${result.url}`);
        }
      });
    } else if (value && Array.isArray(value.nodes)) {
      // `value` is checked first: extendedInfo without a value used to throw here.
      stats.violationCounts[categoryName] += value.nodes.length;
      const messagesToNodes = {};
      value.nodes.forEach((result) => {
        const message = result.failureSummary.replace(/^Fix any of the following:/g, '').trim();
        if (messagesToNodes[message]) {
          messagesToNodes[message].push(result.html);
        } else {
          messagesToNodes[message] = [result.html];
        }
      });
      Object.keys(messagesToNodes).forEach((message) => {
        console.log(` ${message}`);
        messagesToNodes[message].forEach((node) => {
          console.log(` ${node}`.gray);
        });
      });
    } else {
      stats.violationCounts[categoryName]++;
    }
  } else if (details && details.items) {
    details.items.forEach((result) => {
      // Items that are not array-like rows are skipped rather than crashing.
      if (result && result[0] && result[0].text) {
        console.log(` ${result[0].text}`);
      }
    });
  }
}
|
||
/**
 * Prints the end-of-run report: optionally the http:// links found per page,
 * followed by the Lighthouse summary built from the module-level `stats`.
 *
 * @param {Object} config - Crawler configuration; only `showHttpLinksAfter`
 *   is read, to decide whether the collected http links are listed.
 */
function printStats(config) {
  console.log();
  console.log();
  if (config.showHttpLinksAfter) {
    Object.keys(stats.foundHttpLinks).forEach((pageUrl) => {
      console.log('Http link(s) on '.bold.underline + pageUrl.bold.underline);
      stats.foundHttpLinks[pageUrl].forEach((link) => {
        console.log(' ' + link);
      });
    });
  }
  console.log();
  console.log();
  console.log('Lighthouse Summary'.bold.underline);
  console.log(` Total Pages Scanned: ${stats.pageCount}`);
  console.log(` Total Auditing Time: ${new Date() - stats.startTime} ms`);

  const totalTime = Object.keys(stats.auditTimesByPageUrl).reduce((sum, url) => {
    const {endTime, startTime} = stats.auditTimesByPageUrl[url];
    // An audit that never finished has no endTime; counting it would turn
    // the whole sum into NaN.
    return endTime && startTime ? sum + (endTime - startTime) : sum;
  }, 0);
  // Guard the division so a run with no scanned pages prints 0 instead of NaN.
  const averageTime = stats.pageCount > 0 ? Math.round(totalTime / stats.pageCount) : 0;
  console.log(` Average Page Audit Time: ${averageTime} ms`);
  console.log(` Total Audits Passed: ${stats.passedAuditsCount}`, '\u2713'.green);

  const categories = Object.keys(stats.violationCounts);
  if (categories.length === 0) {
    console.log(` Total Violations: None! \\o/ 🎉`);
  } else {
    console.log(` Total Violations:`);
    categories.forEach((category) => {
      console.log(` ${category}: ${stats.violationCounts[category]}`, '\u2717'.red);
    });
  }
}
} |
Oops, something went wrong.