Crawler.js
The code below lists all pages and files on a MediaWiki site. It also lists which files are referenced from each page.
The code is written for Node.js.
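The crawler talks to the MediaWiki API (api.php) with action=query: prop=links is used to discover pages linked from a page, and prop=images to discover the files embedded on it. Both calls return JSON that the code walks via response.query.pages. As a rough illustration (the page id 4711 is made up here; the field names follow the standard MediaWiki API), a prop=links response for the seed page Huvudsida could look something like this:

{
  "query": {
    "pages": {
      "4711": {
        "pageid": 4711,
        "ns": 0,
        "title": "Huvudsida",
        "links": [
          { "ns": 0, "title": "Välkommen" }
        ]
      }
    }
  }
}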
const http = require('http');

// Fetch the links on the next page in the queue and add any new page titles
// to the queue and to the set of known pages.
async function visitNext(state) {
  return new Promise((resolve, reject) => {
    const title = state.queue.pop();
    if(!title) {
      reject(new Error('visitNext() is called when queue is empty'));
      return;
    }
    const options = {
      hostname: '81.91.1.84',
      port: 80,
      path: '/mediawiki/api.php?action=query&prop=links&format=json&pllimit=500&titles=' + encodeURIComponent(title),
      method: 'GET'
    };
    http.get(options, res => {
      let data = [];
      res.on('data', chunk => { data.push(chunk); });
      res.on('end', () => {
        const response = JSON.parse(Buffer.concat(data).toString());
        const pages = response.query.pages;
        for(const index in pages) {
          const page = pages[index];
          if(page.links) {
            page.links.forEach(link => {
              if(!state.known[link.title]) {
                state.known[link.title] = link.title;
                state.queue.push(link.title);
              }
            });
          }
        }
        resolve(state);
      });
    }).on('error', err => {
      reject(new Error('http error: ' + err.message));
    });
  });
}

// Fetch the files (images and other uploads) used on a page and record them
// in state.titles and state.references.
async function extractImages(title, state) {
  return new Promise((resolve, reject) => {
    if(!title) {
      reject(new Error('extractImages() is called with empty page title'));
      return;
    }
    const options = {
      hostname: '81.91.1.84',
      port: 80,
      path: '/mediawiki/api.php?action=query&prop=images&format=json&imlimit=500&titles=' + encodeURIComponent(title),
      method: 'GET'
    };
    http.get(options, res => {
      let data = [];
      res.on('data', chunk => { data.push(chunk); });
      res.on('end', () => {
        const response = JSON.parse(Buffer.concat(data).toString());
        const pages = response.query.pages;
        // Key the references by the requested title, so the lookup in crawl() matches.
        state.references[title] = [];
        for(const index in pages) {
          const page = pages[index];
          if(page.images) {
            page.images.forEach(image => {
              state.titles[image.title] = image.title;
              state.references[title].push(image.title);
            });
          }
        }
        resolve(state);
      });
    }).on('error', err => {
      reject(new Error('http error: ' + err.message));
    });
  });
}

async function crawl() {
  // Add the pages in the initial queue to the known pages.
  let known = {};
  let queue = ['Huvudsida', 'Välkommen'];
  queue.forEach(title => known[title] = title);
  let pageTitles = {known, queue};

  // Find all pages by following links until the queue is empty.
  while(pageTitles.queue.length) {
    pageTitles = await visitNext(pageTitles);
  }

  // Print page titles.
  console.log('--- page titles ------------------------------------------------');
  for(const title in pageTitles.known) {
    console.log(title);
  }

  // For each page, find all files.
  let images = {titles: {}, references: {}};
  for(const title in pageTitles.known) {
    await extractImages(title, images);
  }

  // Print image titles.
  console.log('--- image titles ------------------------------------------------');
  for(const title in images.titles) {
    console.log(title);
  }

  // Print, for each page, the images it references.
  console.log('--- image references ------------------------------------------------');
  for(const title in pageTitles.known) {
    if(images.references[title].length) {
      console.log(title);
      images.references[title].forEach(ref => {
        console.log('  ' + ref);
      });
    }
  }
}
crawl();
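To try the crawler, save the code in a file (the file name crawler.js below is just an assumption) and run it with Node.js:

node crawler.js

The script prints three sections to standard output: all page titles reachable by following links from the seed pages, all file titles, and, per page, the files it references. Note that each query asks for at most 500 links or images per page (pllimit/imlimit); a page with more results than that would require following the API's continue parameter, which this script does not do.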