Standard Crawl PaF Chain - AIF

AIF Crawl

Product
AIF
Category
Technical Notes
language
English
audience
public

The following PaF example corresponds to a standard way to perform a crawl with AIF.

<?xml version="1.0" encoding="UTF-8"?>
<!--
  Sample PaF (processing chain) for a standard web crawl.
  A single pipe named "Crawl" chains eight filters; each filter names the
  next stage through its <afs:successor uri="..."/> element, in this order:
  #crawl, #convert, #follow, #clean, #index, #solve, #build, #deploy.
-->
<afs:PaF xmlns:afs="http://ref.antidot.net/v7/afs#"
         name="CrawlDemo"
         service="42"
         status="beta">
  <afs:pipe name="Crawl" run="once">

    <!-- Stage 1: download the URLs. -->
    <afs:filter uri="#crawl"
                type="afs_web_crawl"
                comment="Crawling">
      <afs:args>
        <afs:arg name="user-agent" value="AntidotCrawler/1.0 (support@antidot.net)"/>
      </afs:args>
      <afs:successors>
        <afs:successor uri="#convert"/>
      </afs:successors>
    </afs:filter>

    <!-- Stage 2: convert the downloaded content into the standardized XML format. -->
    <afs:filter uri="#convert"
                type="afs_docbook_convert"
                instances="1"
                comment="Conversion en DocBook">
      <afs:args>
        <afs:arg name="output_layer" value="VOLATILE_1"/>
      </afs:args>
      <afs:successors>
        <afs:successor uri="#follow"/>
      </afs:successors>
    </afs:filter>

    <!-- Stage 3: apply the link-following strategy. -->
    <afs:filter uri="#follow"
                type="afs_links_follow"
                instances="1"
                comment="Extraction des liens">
      <afs:args>
        <afs:arg name="maxdepth" value="3"/>
      </afs:args>
      <afs:successors>
        <afs:successor uri="#clean"/>
      </afs:successors>
    </afs:filter>

    <!-- Stage 4: clean the HTML to keep only the relevant part. -->
    <afs:filter uri="#clean"
                type="afs_html_scrape"
                instances="1"
                comment="Nettoyage">
      <afs:successors>
        <afs:successor uri="#index"/>
      </afs:successors>
    </afs:filter>

    <!-- Stage 5: index the cleaned documents. -->
    <afs:filter uri="#index"
                type="afs_doc_index"
                comment="Indexation des donnees"
                instances="1">
      <afs:successors>
        <afs:successor uri="#solve"/>
      </afs:successors>
    </afs:filter>

    <!-- Stage 6: resolve outgoing links (Pagerank computation). -->
    <afs:filter uri="#solve"
                type="afs_links_solve"
                comment="Resolution des liens sortants (calcul du Pagerank)">
      <afs:successors>
        <afs:successor uri="#build"/>
      </afs:successors>
    </afs:filter>

    <!-- Stage 7: build the reply database. -->
    <afs:filter uri="#build"
                type="afs_search_build"
                comment="Creation de la base de reponse">
      <afs:successors>
        <afs:successor uri="#deploy"/>
      </afs:successors>
    </afs:filter>

    <!-- Stage 8 (last, no successor): deploy the reply database. -->
    <afs:filter uri="#deploy"
                type="afs_search_deploy"
                comment="Installation de la base de reponse"/>

  </afs:pipe>
</afs:PaF>

paf_crawl_schema

Description:

  • afs_web_crawl: downloads URLs
  • afs_docbook_convert: converts documents into a standardized XML format
  • afs_links_follow: applies the link-following strategy according to constraints defined in the settings (general or per site)
  • afs_html_scrape: cleans the HTML code to keep only the relevant part (runs JavaScript code, removes advertising banners, and so on)
  • afs_doc_index: indexes the previously cleaned files
  • afs_links_solve: deals with external links (computes PageRank)
  • afs_search_build: builds the reply database
  • afs_search_deploy: deploys the reply database on the reply hosts