package com.aliasi.corpus.parsers;

import com.aliasi.classify.BinaryLMClassifier;
import com.aliasi.classify.Classification;
import com.aliasi.corpus.ClassificationHandler;
import com.aliasi.corpus.Corpus;
import com.aliasi.corpus.StringParser;
import com.aliasi.io.FileExtensionFilter;
import com.aliasi.util.Strings;
import java.io.File;
import java.io.IOException;

/**
 * Parser for Reuters-21578 style SGML text that reports documents to a
 * {@link ClassificationHandler} as binary classifications with respect to a
 * single topic.
 *
 * <p>Each record delimited by lines starting with {@code <REUTERS} and
 * {@code </REUTERS} is examined.  Records that carry the {@code TOPICS="Y}
 * marker and that pass the training/test inclusion flags are handed to the
 * handler as the text {@code title + "\n" + dateline + "\n" + body} together
 * with {@link #ON_TOPIC} or {@link #OFF_TOPIC}.
 *
 * <p>This class is marked deprecated in the source it was recovered from.
 */
@Deprecated
public class Reuters21578Parser extends StringParser<ClassificationHandler<CharSequence, Classification>> {
    /** Whether records marked as test documents are passed to the handler. */
    private final boolean mIncludeTestDocuments;
    /** Whether records marked as training documents are passed to the handler. */
    private final boolean mIncludeTrainingDocuments;
    /** The topic against which documents are classified. */
    private final String mTopic;
    /** Trailing marker stripped from the end of a document body (mixed case form). */
    static final String END_BOILERPLATE_1 = "Reuter&#3;";
    /** Trailing marker stripped from the end of a document body (upper case form). */
    static final String END_BOILERPLATE_2 = "REUTER&#3;";
    /** Classification reported for documents whose topics text contains the topic. */
    static final Classification ON_TOPIC = new Classification(BinaryLMClassifier.DEFAULT_ACCEPT_CATEGORY);
    /** Classification reported for documents whose topics text does not contain the topic. */
    static final Classification OFF_TOPIC = new Classification(BinaryLMClassifier.DEFAULT_REJECT_CATEGORY);
    /** The topics accepted by the constructor; see {@link #isAvailableTopic(String)}. */
    static final String[] TOPICS = {"acq", "alum", "austdlr", "barley", "bean", "belly", "bfr", "bop", "cake", "can", "carcass", "castor", "castorseed", "cattle", "chem", "citruspulp", "cocoa", "coconut", "coffee", "copper", "copra", "corn", "cornglutenfeed", "cotton", "cottonseed", "cpi", "cpu", "crude", "cruzado", "debt", "dfl", "dkr", "dlr", "dmk", "earn", "f", "feed", "fishmeal", "fuel", "fx", "gas", "gnp", "gold", "grain", "groundnut", "heat", "hk", "hog", "housing", "income", "instal", "interest", "inventories", "ipi", "iron", "jet", "jobs", "l", "lead", "lei", "lin", "linseed", "lit", "livestock", "lumber", "meal", "metal", "money", "naphtha", "nat", "nickel", "nkr", "nzdlr", "oat", "oil", "oilseed", "orange", "palladium", "palm", "palmkernel", "peseta", "pet", "platinum", "plywood", "pork", "potato", "propane", "rand", "rape", "rapeseed", "red", "reserves", "retail", "rice", "ringgit", "rubber", "rupiah", "rye", "saudriyal", "sfr", "ship", "silver", "skr", "sorghum", "soy", "soybean", "steel", "stg", "strategic", "sugar", "sun", "sunseed", "supply", "tapioca", "tea", "tin", "trade", "veg", "wheat", "wool", "wpi", "yen", "zinc"};

    /**
     * Corpus view over a directory of {@code .sgm} files, classifying every
     * document against a single topic.
     */
    @Deprecated
    private static class ReutersCorpus extends Corpus<ClassificationHandler<CharSequence, Classification>> {
        /** Topic handed to each parser created by this corpus. */
        private final String mTopic;
        /** Directory whose {@code .sgm} files are parsed. */
        private final File mDirectory;

        /**
         * Creates a corpus for the given topic and directory.  The topic is
         * not validated here; it is validated when a parser is constructed
         * during a visit.
         *
         * @param topic topic to classify against
         * @param directory directory containing the {@code .sgm} files
         */
        ReutersCorpus(String topic, File directory) {
            this.mTopic = topic;
            this.mDirectory = directory;
        }

        /**
         * Visits the directory with both training and test documents included.
         *
         * @param handler receiver of the classified documents
         * @throws IOException if the directory cannot be listed or parsing a file fails
         */
        @Override // com.aliasi.corpus.Corpus
        @Deprecated
        public void visitCorpus(ClassificationHandler<CharSequence, Classification> handler) throws IOException {
            visit(handler, true, true);
        }

        /**
         * Visits the directory with training documents excluded.
         *
         * @param handler receiver of the classified documents
         * @throws IOException if the directory cannot be listed or parsing a file fails
         */
        @Override // com.aliasi.corpus.Corpus
        @Deprecated
        public void visitTest(ClassificationHandler<CharSequence, Classification> handler) throws IOException {
            visit(handler, false, true);
        }

        /**
         * Visits the directory with test documents excluded.
         *
         * @param handler receiver of the classified documents
         * @throws IOException if the directory cannot be listed or parsing a file fails
         */
        @Override // com.aliasi.corpus.Corpus
        @Deprecated
        public void visitTrain(ClassificationHandler<CharSequence, Classification> handler) throws IOException {
            visit(handler, true, false);
        }

        /**
         * Parses every {@code .sgm} file in the directory with a fresh parser
         * configured with this corpus's topic and the given inclusion flags.
         *
         * @param handler receiver of the classified documents
         * @param includeTrainingDocuments whether training documents are reported
         * @param includeTestDocuments whether test documents are reported
         * @throws IllegalArgumentException if the corpus topic is not an available topic
         * @throws IOException if the directory cannot be listed or parsing a file fails
         */
        @Deprecated
        void visit(ClassificationHandler<CharSequence, Classification> handler,
                   boolean includeTrainingDocuments,
                   boolean includeTestDocuments) throws IOException {
            // Construct the parser first so an invalid topic is reported before any file access.
            Reuters21578Parser parser =
                new Reuters21578Parser(this.mTopic, includeTrainingDocuments, includeTestDocuments);
            parser.setHandler(handler);
            File[] sgmlFiles = this.mDirectory.listFiles(new FileExtensionFilter(".sgm"));
            if (sgmlFiles == null) {
                // File.listFiles returns null (rather than an empty array) when the path
                // is not a directory or cannot be read; report that instead of an NPE.
                throw new IOException("Could not list files in directory=" + this.mDirectory);
            }
            for (File sgmlFile : sgmlFiles) {
                parser.parse(sgmlFile);
            }
        }
    }

    /**
     * Creates a parser that classifies documents against the given topic.
     *
     * @param topic topic to classify against; must satisfy {@link #isAvailableTopic(String)}
     * @param includeTrainingDocuments whether documents marked as training are reported
     * @param includeTestDocuments whether documents marked as test are reported
     * @throws IllegalArgumentException if the topic is not an available topic (including {@code null})
     */
    public Reuters21578Parser(String topic, boolean includeTrainingDocuments, boolean includeTestDocuments) {
        this.mIncludeTrainingDocuments = includeTrainingDocuments;
        this.mIncludeTestDocuments = includeTestDocuments;
        this.mTopic = topic;
        if (!isAvailableTopic(this.mTopic)) {
            throw new IllegalArgumentException("Require known topic. Found topic=" + topic);
        }
    }

    /**
     * Splits the given character slice into lines and passes each record,
     * from a line starting with {@code <REUTERS} up to (not including) the
     * next line starting with {@code </REUTERS}, to
     * {@link #handleDocument(String)}.
     *
     * <p>If the input ends before a closing line is found, the lines
     * collected so far are still passed on as a record instead of reading
     * past the end of the input.
     *
     * @param cs underlying characters
     * @param start index of the first character to parse
     * @param end index one past the last character to parse
     */
    @Override // com.aliasi.corpus.Parser
    public void parseString(char[] cs, int start, int end) {
        String[] lines = new String(cs, start, end - start).split("\n");
        int lineIndex = 0;
        while (lineIndex < lines.length) {
            if (lines[lineIndex].startsWith("<REUTERS")) {
                StringBuilder document = new StringBuilder();
                // Bounded scan: an unterminated record must not run off the end of the array.
                while (lineIndex < lines.length && !lines[lineIndex].startsWith("</REUTERS")) {
                    document.append(lines[lineIndex]).append("\n");
                    lineIndex++;
                }
                handleDocument(document.toString());
            }
            // Skips either a non-record line or the closing line of the record just handled.
            lineIndex++;
        }
    }

    /**
     * Reports a single record to the handler if it carries the topics marker
     * and is not excluded by the training/test flags.
     *
     * <p>The text reported is the title, dateline and body joined by
     * newlines, with a trailing end-of-story marker removed from the body.
     *
     * @param document text of one record
     * @throws IllegalArgumentException if an element used here has a start tag but no end tag
     */
    void handleDocument(String document) {
        if (!hasTopics(document)) {
            return;
        }
        if (isTrainingDocument(document) && !this.mIncludeTrainingDocuments) {
            return;
        }
        if (isTestDocument(document) && !this.mIncludeTestDocuments) {
            return;
        }

        String topics = extract("TOPICS", document, true);
        String title = extract("TITLE", document, true);
        String dateline = extract("DATELINE", document, true);
        String body = extract("BODY", document, true);

        // Strip whichever marker is present using its own length, so the code
        // stays correct even if the two constants ever differ in length.
        if (body.endsWith(END_BOILERPLATE_1)) {
            body = body.substring(0, body.length() - END_BOILERPLATE_1.length());
        } else if (body.endsWith(END_BOILERPLATE_2)) {
            body = body.substring(0, body.length() - END_BOILERPLATE_2.length());
        }

        StringBuilder text = new StringBuilder();
        text.append(title).append("\n");
        text.append(dateline).append("\n");
        text.append(body);

        // NOTE(review): this is a substring test on the raw TOPICS element text, so a
        // short topic such as "f" or "oil" also matches longer topics that contain
        // it (e.g. "coffee", "oilseed"). Behavior kept as-is; confirm whether
        // whole-token matching is wanted.
        Classification classification = topics.indexOf(this.mTopic) >= 0 ? ON_TOPIC : OFF_TOPIC;
        getHandler().handle(text, classification);
    }

    /**
     * Returns the text between the first {@code <element>} tag and the next
     * {@code </element>} tag following it.
     *
     * @param element element name, without angle brackets
     * @param text text to search
     * @param optional if {@code true}, a missing start tag yields the empty
     *     string instead of an exception
     * @return the text strictly between the start and end tags, or the empty
     *     string if the start tag is missing and {@code optional} is set
     * @throws IllegalArgumentException if the start tag is missing and
     *     {@code optional} is {@code false}, or if the start tag is present
     *     but no end tag follows it
     */
    static String extract(String element, String text, boolean optional) {
        String startTag = "<" + element + ">";
        String endTag = "</" + element + ">";
        int startTagIndex = text.indexOf(startTag);
        if (startTagIndex < 0) {
            if (optional) {
                return Strings.EMPTY_STRING;
            }
            throw new IllegalArgumentException("no start, elt=" + element + " text=" + text);
        }
        int contentStart = startTagIndex + startTag.length();
        int contentEnd = text.indexOf(endTag, contentStart);
        if (contentEnd < 0) {
            throw new IllegalArgumentException("no end, elt=" + element + " text=" + text);
        }
        return text.substring(contentStart, contentEnd);
    }

    /**
     * Returns a fresh copy of the available topics, so callers may modify
     * the result without affecting this class.
     *
     * @return newly allocated array of the available topics
     */
    public static String[] availableTopics() {
        return TOPICS.clone();
    }

    /**
     * Returns {@code true} if the given string is exactly one of the
     * available topics.
     *
     * @param topic candidate topic; {@code null} yields {@code false}
     * @return whether the topic is available
     */
    public static boolean isAvailableTopic(String topic) {
        for (String available : TOPICS) {
            if (available.equals(topic)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns a corpus over the {@code .sgm} files of the given directory
     * for the given topic.  Neither argument is checked until the corpus is
     * visited.
     *
     * @param topic topic to classify against
     * @param directory directory containing the {@code .sgm} files
     * @return corpus for the topic and directory
     * @throws IOException declared for callers; not thrown by this implementation
     */
    @Deprecated
    public static Corpus<ClassificationHandler<CharSequence, Classification>> corpus(String topic, File directory) throws IOException {
        return new ReutersCorpus(topic, directory);
    }

    /**
     * Returns {@code true} if the record text contains {@code TOPICS="Y}.
     *
     * @param document text of one record
     * @return whether the topics marker is present
     */
    static boolean hasTopics(String document) {
        return containsText(document, "TOPICS=\"Y");
    }

    /**
     * Returns {@code true} if the record text contains {@code LEWISSPLIT="TR}.
     *
     * @param document text of one record
     * @return whether the training-split marker is present
     */
    static boolean isTrainingDocument(String document) {
        return containsText(document, "LEWISSPLIT=\"TR");
    }

    /**
     * Returns {@code true} if the record text contains {@code LEWISSPLIT="TE}.
     *
     * @param document text of one record
     * @return whether the test-split marker is present
     */
    static boolean isTestDocument(String document) {
        return containsText(document, "LEWISSPLIT=\"TE");
    }

    /**
     * Returns {@code true} if the first string contains the second.
     *
     * @param text text to search
     * @param target substring to look for
     * @return whether {@code target} occurs in {@code text}
     */
    static boolean containsText(String text, String target) {
        return text.contains(target);
    }
}
