package edu.umass.cs.mallet.base.classify.tui;

import com.hp.hpl.jena.sparql.sse.Tags;
import edu.umass.cs.mallet.base.pipe.CharSequence2TokenSequence;
import edu.umass.cs.mallet.base.pipe.CharSubsequence;
import edu.umass.cs.mallet.base.pipe.FeatureSequence2AugmentableFeatureVector;
import edu.umass.cs.mallet.base.pipe.Input2CharSequence;
import edu.umass.cs.mallet.base.pipe.Noop;
import edu.umass.cs.mallet.base.pipe.Pipe;
import edu.umass.cs.mallet.base.pipe.SaveDataInSource;
import edu.umass.cs.mallet.base.pipe.SerialPipes;
import edu.umass.cs.mallet.base.pipe.Target2Label;
import edu.umass.cs.mallet.base.pipe.TokenSequence2FeatureSequence;
import edu.umass.cs.mallet.base.pipe.TokenSequence2FeatureSequenceWithBigrams;
import edu.umass.cs.mallet.base.pipe.TokenSequenceLowercase;
import edu.umass.cs.mallet.base.pipe.TokenSequenceNGrams;
import edu.umass.cs.mallet.base.pipe.TokenSequenceRemoveNonAlpha;
import edu.umass.cs.mallet.base.pipe.TokenSequenceRemoveStopwords;
import edu.umass.cs.mallet.base.pipe.iterator.FileIterator;
import edu.umass.cs.mallet.base.types.InstanceList;
import edu.umass.cs.mallet.base.util.CharSequenceLexer;
import edu.umass.cs.mallet.base.util.CommandOption;
import edu.umass.cs.mallet.base.util.MalletLogger;
import edu.umass.cs.mallet.base.util.Strings;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.util.logging.Logger;

/* JADX WARN: Classes with same name are omitted:
  input_file:WEB-INF/lib/bibsonomy-scraper-2.0.1.jar:org/bibsonomy/scraper/ie/training/mallet.jar:edu/umass/cs/mallet/base/classify/tui/Text2Vectors.class
 */
/* loaded from: input_file:WEB-INF/lib/mallet-0.4-steuber.jar:edu/umass/cs/mallet/base/classify/tui/Text2Vectors.class */
/**
 * Command-line tool that reads text files from one directory per class label, runs them
 * through a {@link Pipe} sequence and serializes the resulting {@link InstanceList}.
 *
 * <p>The pipe is either taken from a previously written vectors file
 * ({@code --use-pipe-from}) or assembled from the command-line options declared below.
 * The result is written to {@code --output}; the value {@code -} selects standard output.
 */
public class Text2Vectors {
    private static final Logger logger = MalletLogger.getLogger(Text2Vectors.class.getName());

    // Command-line options. They are registered against Text2Vectors.class and filled in by
    // CommandOption.process(); the declarations are intentionally left non-final and
    // package-visible, exactly as they were.
    static CommandOption.SpacedStrings classDirs = new CommandOption.SpacedStrings(Text2Vectors.class, "input", "DIR...", true, null, "The directories containing text files to be classified, one directory per class", null);
    static CommandOption.File outputFile = new CommandOption.File(Text2Vectors.class, "output", "FILE", true, new File("text.vectors"), "Write the instance list to this file; Using - indicates stdout.", null);
    static CommandOption.File usePipeFromVectorsFile = new CommandOption.File(Text2Vectors.class, "use-pipe-from", "FILE", true, new File("text.vectors"), "Use the pipe and alphabets from a previously created vectors file. Allows the creation, for example, of a test set of vectors that are compatible with a previously created set of training vectors", null);
    static CommandOption.Boolean preserveCase = new CommandOption.Boolean(Text2Vectors.class, "preserve-case", "[TRUE|FALSE]", false, false, "If true, do not force all strings to lowercase.", null);
    static CommandOption.Boolean removeStopWords = new CommandOption.Boolean(Text2Vectors.class, "remove-stopwords", "[TRUE|FALSE]", false, false, "If true, remove common \"stop words\" from the text.", null);
    static CommandOption.Boolean skipHeader = new CommandOption.Boolean(Text2Vectors.class, "skip-header", "[TRUE|FALSE]", false, false, "If true, in each document, remove text occurring before a blank line.  This is useful for removing email or UseNet headers", null);
    static CommandOption.Boolean skipHtml = new CommandOption.Boolean(Text2Vectors.class, "skip-html", "[TRUE|FALSE]", false, false, "If true, remove text occurring inside <...>, as in HTML or SGML.", null);
    static CommandOption.IntegerArray gramSizes = new CommandOption.IntegerArray(Text2Vectors.class, "gram-sizes", "INTEGER [INTEGER ...]", true, new int[]{1}, "Include among the features all n-grams of sizes specified.  This option occurs after the removal of stop words, if removed.", null);
    static CommandOption.Boolean keepSequence = new CommandOption.Boolean(Text2Vectors.class, "keep-sequence", "[TRUE|FALSE]", false, false, "If true, final data will be a FeatureSequence rather than a FeatureVector.", null);
    static CommandOption.Boolean keepSequenceBigrams = new CommandOption.Boolean(Text2Vectors.class, "keep-sequence-bigrams", "[TRUE|FALSE]", false, false, "If true, final data will be a FeatureSequenceWithBigrams rather than a FeatureVector.", null);
    static CommandOption.Object stringPipe = new CommandOption.Object(Text2Vectors.class, "string-pipe", "Pipe constructor", true, null, "Java code for the constructor of a Pipe to be run as soon as input becomes a CharSequence", null);
    static CommandOption.Object tokenPipe = new CommandOption.Object(Text2Vectors.class, "token-pipe", "Pipe constructor", true, null, "Java code for the constructor of a Pipe to be run as soon as input becomes a TokenSequence", null);
    static CommandOption.Object fvPipe = new CommandOption.Object(Text2Vectors.class, "fv-pipe", "Pipe constructor", true, null, "Java code for the constructor of a Pipe to be run as soon as input becomes a FeatureVector", null);

    /**
     * Entry point: parses the options, builds or loads the pipe, converts the input
     * directories into an instance list and serializes it.
     *
     * <p>Exits with status {@code -1} when no arguments are given or no input
     * directories were specified.
     *
     * @param strArr the raw command-line arguments
     * @throws FileNotFoundException if an output file cannot be opened for writing
     * @throws IOException if serializing an instance list fails
     */
    public static void main(String[] strArr) throws FileNotFoundException, IOException {
        CommandOption.setSummary(Text2Vectors.class, "A tool for creating instance lists of feature vectors from text documents.\nLast arguments must be list of directories containing text files to be classified,\none directory per class.");
        CommandOption.process(Text2Vectors.class, strArr);
        if (strArr.length == 0) {
            CommandOption.getList(Text2Vectors.class).printUsage(false);
            System.exit(-1);
        }
        // The option is declared with a null default, so guard against null as well as empty.
        if (classDirs.value == null || classDirs.value.length == 0) {
            System.err.println("You must include '--input DIR1 DIR2 ...' in order to specify a list of directories containing the documents for each class.");
            System.exit(-1);
        }

        int commonPrefixIndex = Strings.commonPrefixIndex(classDirs.value);
        logger.info("Labels = ");
        File[] directories = new File[classDirs.value.length];
        for (int i = 0; i < classDirs.value.length; i++) {
            String dir = classDirs.value[i];
            directories[i] = new File(dir);
            // Strip the shared prefix only when something remains afterwards. The bound is
            // the length of this directory string, not the number of directories.
            if (commonPrefixIndex < dir.length()) {
                logger.info("   " + dir.substring(commonPrefixIndex));
            } else {
                logger.info("   " + dir);
            }
        }

        InstanceList previousList = null;
        Pipe pipe;
        if (usePipeFromVectorsFile.wasInvoked()) {
            previousList = InstanceList.load(usePipeFromVectorsFile.value);
            pipe = previousList.getPipe();
        } else {
            pipe = buildPipe();
        }

        InstanceList instances = new InstanceList(pipe);
        instances.add(new FileIterator(directories, FileIterator.STARTING_DIRECTORIES, true));

        boolean toStdout = outputFile.value.toString().equals("-");
        if (toStdout) {
            // Flush but do not close: closing would also close System.out for the rest of
            // the process.
            ObjectOutputStream stdoutStream = new ObjectOutputStream(System.out);
            stdoutStream.writeObject(instances);
            stdoutStream.flush();
        } else {
            writeToFile(instances, outputFile.value);
        }

        if (usePipeFromVectorsFile.wasInvoked()) {
            String message = " output usepipe ilist pipe instance id =" + previousList.getPipe().getInstanceId();
            if (toStdout) {
                // Standard output carries the serialized data; keep the diagnostic out of it.
                System.err.println(message);
            } else {
                System.out.println(message);
            }
            // Write the loaded list back to the file it came from.
            // NOTE(review): presumably because the shared pipe/alphabets may have changed
            // while processing the new documents - confirm.
            writeToFile(previousList, usePipeFromVectorsFile.value);
        }
    }

    /**
     * Assembles the processing pipe from the command-line options. Stages that are
     * switched off are filled with {@link Noop} so the stage order stays fixed.
     *
     * @return the serial pipe to run over every input document
     */
    private static Pipe buildPipe() {
        Pipe[] stages = new Pipe[] {
            new Target2Label(),
            new SaveDataInSource(),
            new Input2CharSequence(),
            stringPipe.wasInvoked() ? (Pipe) stringPipe.value : new Noop(),
            skipHeader.value ? new CharSubsequence(CharSubsequence.SKIP_HEADER) : new Noop(),
            keepSequenceBigrams.value ? new CharSequence2TokenSequence(CharSequenceLexer.LEX_NONWHITESPACE_CLASSES) : new CharSequence2TokenSequence(),
            tokenPipe.wasInvoked() ? (Pipe) tokenPipe.value : new Noop(),
            preserveCase.value ? new Noop() : new TokenSequenceLowercase(),
            keepSequenceBigrams.value ? new TokenSequenceRemoveNonAlpha(true) : new Noop(),
            removeStopWords.value ? new TokenSequenceRemoveStopwords(false, keepSequenceBigrams.value) : new Noop(),
            // Plain unigrams need no n-gram expansion.
            (gramSizes.value.length == 1 && gramSizes.value[0] == 1) ? new Noop() : new TokenSequenceNGrams(gramSizes.value),
            keepSequenceBigrams.value ? new TokenSequence2FeatureSequenceWithBigrams() : new TokenSequence2FeatureSequence(),
            (keepSequence.value || keepSequenceBigrams.value) ? new Noop() : new FeatureSequence2AugmentableFeatureVector(),
            fvPipe.wasInvoked() ? (Pipe) fvPipe.value : new Noop(),
        };
        return new SerialPipes(stages);
    }

    /**
     * Serializes {@code instances} to {@code file}, releasing the file handle even when
     * serialization fails.
     *
     * @param instances the instance list to write
     * @param file the destination file
     * @throws IOException if the file cannot be opened or written
     */
    private static void writeToFile(InstanceList instances, File file) throws IOException {
        FileOutputStream fileStream = new FileOutputStream(file);
        try {
            ObjectOutputStream objectStream = new ObjectOutputStream(fileStream);
            objectStream.writeObject(instances);
            objectStream.close();
        } finally {
            // Harmless second close on success; releases the handle on failure.
            fileStream.close();
        }
    }
}
