Crawler.js

The code below lists all pages and files on a MediaWiki installation. It also lists which files are referenced from each page.

The code below is written for Node.js.
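
For reference, the parsing below assumes that a links query against the MediaWiki API returns JSON of roughly this shape (abridged; the page id and titles are only illustrative):

 {
   "query": {
     "pages": {
       "1": {
         "pageid": 1,
         "title": "Huvudsida",
         "links": [
           { "ns": 0, "title": "Välkommen" }
         ]
       }
     }
   }
 }

An images query (prop=images) follows the same structure, with an images array in place of links.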

 const http = require('http');
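 
 // visitNext() takes the next title from the queue, asks the MediaWiki API for
 // all links on that page, and enqueues every linked title not seen before.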
 async function visitNext(state) {
   return new Promise((resolve, reject) => {
     const title = state.queue.pop();
     if(!title) {
       // Return here so execution does not continue with an undefined title.
       return reject(new Error('visitNext() was called with an empty queue'));
     }
     const options = {
       hostname: '81.91.1.84',
       port: 80,
       path: '/mediawiki/api.php?action=query&prop=links&format=json&pllimit=500&titles=' + encodeURIComponent(title),
       method: 'GET'
     }
     http.get(options, res => {
       let data = [];
       res.on('data', chunk => {
         data.push(chunk);
       });
       res.on('end', () => {
         const response = JSON.parse(Buffer.concat(data).toString());
         const pages = response.query.pages;
         for(const index in pages) {
           const page = pages[index];
           if(page.links) {
             page.links.forEach(link => {
               if(!state.known[link.title]){
                 state.known[link.title] = link.title;
                 state.queue.push(link.title);
               }
             });
           }
         }
         resolve(state);
       });
     }).on('error', err => {
       reject(new Error('http error: ' + err.message));
     });
   });
 }
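 
 // extractImages() asks the API for the images used on one page and records
 // them in state.titles (every image title seen) and state.references (the
 // image titles used by each page).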
 async function extractImages(title, state) {
   return new Promise((resolve, reject) => {
     if(!title) {
       return reject(new Error('extractImages() was called with an empty page title'));
     }
     const options = {
       hostname: '81.91.1.84',
       port: 80,
       path: '/mediawiki/api.php?action=query&prop=images&format=json&imlimit=500&titles=' + encodeURIComponent(title),
       method: 'GET'
     }
     http.get(options, res => {
       let data = [];
       res.on('data', chunk => {
         data.push(chunk);
       });
       res.on('end', () => {
         const response = JSON.parse(Buffer.concat(data).toString());
         const pages = response.query.pages;
         // Initialize before the loop so the title gets an (empty) entry even
         // when the API returns no matching page.
         state.references[title] = [];
         for(const index in pages) {
           const page = pages[index];
           if(page.images) {
             page.images.forEach(image => {
               state.titles[image.title] = image.title;
               // Key on the queried title rather than page.title: the API may
               // normalize titles, and crawl() looks entries up by the queried title.
               state.references[title].push(image.title);
             });
           }
         }
         resolve(state);
       });
     }).on('error', err => {
       reject(new Error('http error: ' + err.message));
     });
   });
 }
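 
 // crawl() drives the whole run: it follows links from the start pages until
 // the queue is empty, then queries the images on every page found and prints
 // the page titles, the image titles and the image references per page.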
 async function crawl() {
   let known = {};
   let queue = ['Huvudsida', 'Välkommen'];
   // Add the pages in the initial queue to the known pages.
   queue.forEach(title => known[title] = title);
   let pageTitles = {known, queue};
   // find all pages
   while(pageTitles.queue.length) {
     pageTitles = await visitNext(pageTitles);
   }
   // print page titles
   console.log('--- page titles ------------------------------------------------');
   for(const title in pageTitles.known) {
     console.log(title);
   }
   // for each page, find all files
   let images = {titles:{}, references:{}};
   for(const title in pageTitles.known) {
     await extractImages(title, images);
   }
   // print images
   console.log('--- image titles ------------------------------------------------');
   for(const title in images.titles) {
     console.log(title);
   }
   // print images references
   console.log('--- image references ------------------------------------------------');
   for(const title in pageTitles.known) {
     if(images.references[title].length) {
       console.log(title);
       images.references[title].forEach(ref => {
         console.log('   ' + ref);
       });
     }
   }
 }
 crawl();
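
To run the crawler, save the script as crawler.js and start it with Node.js:

 node crawler.js

The host name in the two option objects (81.91.1.84) and the start pages in the queue are specific to this wiki, so adjust them for another installation. Note also that pllimit and imlimit cap each response at 500 entries; a page with more links or images than that would need the API's continuation mechanism (the continue field in the response), which this script does not handle.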