afs_doc_classify - AFS

AFS Filters Description

Product
AFS
AFS_Version
7.9
Category
Reference Guide

Using a semantic classifier, this filter classifies a document by analyzing its semantic proximity to different categories.

The filter is declared with the afs_doc_classify type. It is in the antidot-paf-misc package. It is a processor filter.

The parameters of the document classifier filter are described in the following table:

Parameter name

Mandatory

Type

Default

Description

db_dir

Yes

directory

N/A

Path to the classification rules

input_layer

No

layer

CONTENTS

Name of the layer containing the text to classify (must be raw text)

output_layer

No

layer

CONTENTS

Name of the layer to fill with the classification result

threshold

No

float

0.01

Semantic proximity threshold, between 0 (weak: matches many categories) and 1 (strong: matches fewer categories)

output_format

No

string

XML

Format into which the categories will be serialized

Filter configuration example:
<!-- Classification pipeline: text extraction, then classification, then protobuf-to-XML serialization -->
<!-- Text-extraction step: reads the CONTENTS layer and writes the USER_1 layer,
     using the XPath /document/fulltext. Its only successor is the classify filter,
     so USER_1 is presumably the raw text that gets classified. -->
<afs:filter uri="http://demo.antidot.net/kantarmedia/extract"
            type="afs_text_extract"
            instances="1"
            comment="Extraction des données pour classification et annotation">
    <afs:args>
        <afs:arg name="input_layer" value="CONTENTS"/>
        <afs:arg name="output_layer" value="USER_1"/>
        <!-- XPath expressions selecting the content to extract. -->
        <afs:arg name="xpaths">
            <afs:list>
                <afs:param value="/document/fulltext"/>
            </afs:list>
        </afs:arg>
    </afs:args>
    <afs:successors>
        <afs:successor uri="http://demo.antidot.net/kantarmedia/classify"/>
    </afs:successors>
</afs:filter>
<!-- Classification step (afs_doc_classify): reads the USER_1 layer, writes the
     VOLATILE_1 layer, then hands over to the pb2xml filter. -->
<afs:filter uri="http://demo.antidot.net/kantarmedia/classify"
            type="afs_doc_classify"
            instances="1"
            comment="Classification IPTC">
    <afs:args>
        <!-- Path to the classification database. -->
        <afs:arg name="db_dir" value="$AFS7/conf/classif/iptc"/>
        <!-- The input layer has to contain raw text. -->
        <afs:arg name="input_layer" value="USER_1"/>
        <!-- The output layer will contain protobuf data. -->
        <afs:arg name="output_layer" value="VOLATILE_1"/>
        <!-- Between 0 and 1: lower values lead to more annotations, higher values to fewer. -->
        <afs:arg name="threshold" value="0.5"/>
    </afs:args>
    <afs:successors>
        <afs:successor uri="http://demo.antidot.net/kantarmedia/pb2xml"/>
    </afs:successors>
</afs:filter>
<!-- Serialization step (afs_protobuf_serialize): reads the VOLATILE_1 layer and
     writes the USER_2 layer; per its comment attribute, it converts the protobuf
     classification result to XML. Its successor is the index filter. -->
<afs:filter uri="http://demo.antidot.net/kantarmedia/pb2xml"
            type="afs_protobuf_serialize"
            instances="1"
            comment="Conversion protobuff vers XML">
    <afs:args>
        <!-- Name of the protobuf message to serialize. -->
        <afs:arg name="protobuf_name" value="N_Classify.Classify"/>
        <afs:arg name="input_layer" value="VOLATILE_1"/>
        <afs:arg name="output_layer" value="USER_2"/>
    </afs:args>
    <afs:successors>
        <afs:successor uri="http://demo.antidot.net/kantarmedia/index"/>
    </afs:successors>
</afs:filter>

Filter output example:
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<!-- Illustrative output (this comment is not part of the filter output): the root
     afs:Classify element carries a confidence attribute, and each afs:predict child
     names one category (class_name) with its proba value, presumably the
     probability assigned to that category. -->
<afs:Classify xmlns:afs="http://ref.antidot.net/v7/afs#" confidence="0.907">
    <afs:predict class_name="Science" proba="0.233"/>
    <afs:predict class_name="Humor" proba="0.212"/>
    <afs:predict class_name="Aerospace" proba="0.230"/>
    <afs:predict class_name="Nature" proba="0.233"/>
</afs:Classify>