package edu.umass.cs.mallet.base.pipe;

import com.fasterxml.jackson.core.util.MinimalPrettyPrinter;
import edu.umass.cs.mallet.base.extract.StringSpan;
import edu.umass.cs.mallet.base.extract.StringTokenization;
import edu.umass.cs.mallet.base.types.Instance;
import edu.umass.cs.mallet.base.types.Token;
import edu.umass.cs.mallet.base.types.TokenSequence;
import edu.umass.cs.mallet.base.util.CharSequenceLexer;
import edu.umass.cs.mallet.base.util.MalletLogger;
import java.io.File;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;

/* loaded from: input_file:WEB-INF/lib/mallet-0.4-jaeschke.jar:edu/umass/cs/mallet/base/pipe/SGML2TokenSequence.class */
/**
 * Pipe that converts SGML-style marked-up text into a pair of parallel token sequences.
 *
 * <p>The instance data is expected to be a {@link CharSequence}. Markup of the form
 * {@code <tag>} / {@code </tag>} is located with a regular expression; the text between
 * consecutive markup elements is tokenized with the configured {@link CharSequenceLexer}.
 * For every text token a {@link StringSpan} is added to the new instance data, and a
 * {@link Token} whose text is the label in effect is added to the new instance target.
 *
 * <p>Labelling rule: text following an opening tag is labelled with the tag's content
 * (everything between {@code <} and {@code >}); text following any closing tag, and text
 * before the first tag, is labelled with the background tag. Nesting is not tracked: a
 * closing tag always resets the label to the background tag.
 *
 * <p>NOTE(review): the lexer is a shared, stateful field that {@link #pipe(Instance)}
 * reconfigures on each call, so concurrent use of one pipe instance looks unsafe — confirm
 * before sharing across threads.
 */
public class SGML2TokenSequence extends Pipe implements Serializable {
    private static final Logger logger = MalletLogger.getLogger(SGML2TokenSequence.class.getName());

    /** Regular expression matching one opening or closing markup element. */
    private static final String SGML_REGEX = "</?([^>]*)>";

    /** Compiled markup pattern; restored from the stream on deserialization. */
    Pattern sgmlPattern;
    /** Lexer used to split the text between markup elements into tokens. */
    CharSequenceLexer lexer;
    /** Label given to tokens that are not inside an opening tag's scope. */
    String backgroundTag;
    /** When true, {@link #pipe(Instance)} also stores the tokenization as the instance source. */
    private boolean saveSource;

    private static final long serialVersionUID = 1;
    /** Version number written at the head of the custom serialized form. */
    private static final int CURRENT_SERIAL_VERSION = 1;

    /**
     * Creates a pipe with an explicit lexer, background tag and source-saving policy.
     *
     * @param charSequenceLexer lexer used to tokenize text between markup elements
     * @param str label for tokens outside any opening tag's scope (the background tag)
     * @param z whether {@link #pipe(Instance)} should set the instance source
     */
    public SGML2TokenSequence(CharSequenceLexer charSequenceLexer, String str, boolean z) {
        this.sgmlPattern = Pattern.compile(SGML_REGEX);
        this.lexer = charSequenceLexer;
        this.backgroundTag = str;
        this.saveSource = z;
    }

    /**
     * Creates a pipe with an explicit lexer and background tag; the source is saved.
     *
     * @param charSequenceLexer lexer used to tokenize text between markup elements
     * @param str label for tokens outside any opening tag's scope (the background tag)
     */
    public SGML2TokenSequence(CharSequenceLexer charSequenceLexer, String str) {
        this(charSequenceLexer, str, true);
    }

    /**
     * Creates a pipe whose lexer is built from the given string; the source is saved.
     *
     * @param str argument passed to the {@link CharSequenceLexer} constructor
     *            (presumably the token regular expression — confirm against CharSequenceLexer)
     * @param str2 label for tokens outside any opening tag's scope (the background tag)
     */
    public SGML2TokenSequence(String str, String str2) {
        this(new CharSequenceLexer(str), str2, true);
    }

    /**
     * Creates a pipe with a default lexer and the default background tag.
     *
     * <p>NOTE(review): the PDAnnotationLink constant looks like a decompiler substitution for
     * a plain string literal (presumably "O") — confirm against the original sources.
     */
    public SGML2TokenSequence() {
        this(new CharSequenceLexer(), PDAnnotationLink.HIGHLIGHT_MODE_OUTLINE);
    }

    /**
     * Replaces the instance data with a {@link StringTokenization} of the text between
     * markup elements and the instance target with a parallel {@link TokenSequence} of labels.
     * If source saving is enabled, the tokenization is also stored as the instance source.
     *
     * @param instance instance whose data is a {@link CharSequence}
     * @return the same instance, modified in place
     */
    @Override // edu.umass.cs.mallet.base.pipe.Pipe
    public Instance pipe(Instance instance) {
        CharSequence text = (CharSequence) instance.getData();
        StringTokenization dataTokens = new StringTokenization(text);
        TokenSequence targetTokens = new TokenSequence();
        // Label applied to the text segment currently being emitted.
        String currentTag = this.backgroundTag;
        // Label that takes effect after the markup element just matched.
        String nextTag = this.backgroundTag;
        Matcher matcher = this.sgmlPattern.matcher(text);
        // Offset where the current text segment begins (end of the previous markup element).
        int textStart = 0;
        // Offset where the following text segment will begin.
        int nextStart = 0;
        boolean done = false;
        logger.fine(this.sgmlPattern.pattern());
        logger.finer(text.toString());
        while (!done) {
            done = !matcher.find();
            int textEnd;
            if (done) {
                // No more markup: the final segment runs to the end of the input.
                textEnd = text.length();
            } else {
                String sgml = matcher.group();
                logger.finer("SGML = " + sgml);
                logger.finer(Integer.toString(matcher.groupCount()));
                // A closing tag resets the label to the background tag; an opening tag
                // makes its content (without the angle brackets) the new label.
                nextTag = sgml.charAt(1) == '/' ? this.backgroundTag : sgml.substring(1, sgml.length() - 1);
                logger.finer("nextTag: " + nextTag);
                nextStart = matcher.end();
                textEnd = matcher.start();
                logger.finer("Text start/end " + textStart + MinimalPrettyPrinter.DEFAULT_ROOT_VALUE_SEPARATOR + textEnd);
            }
            if (textEnd - textStart > 0) {
                logger.finer("Tag = " + currentTag);
                logger.finer("Target = " + ((Object) text.subSequence(textStart, textEnd)));
                this.lexer.setCharSequence(text.subSequence(textStart, textEnd));
                while (this.lexer.hasNext()) {
                    this.lexer.next();
                    // Lexer offsets are relative to the sub-sequence; shift them back
                    // into coordinates of the full text.
                    dataTokens.add((Token) new StringSpan(text, textStart + this.lexer.getStartOffset(), textStart + this.lexer.getEndOffset()));
                    targetTokens.add(new Token(currentTag));
                }
            }
            textStart = nextStart;
            currentTag = nextTag;
        }
        instance.setData(dataTokens);
        instance.setTarget(targetTokens);
        if (this.saveSource) {
            instance.setSource(dataTokens);
        }
        return instance;
    }

    /**
     * Command-line driver: runs each file named in the arguments through
     * {@code Input2CharSequence} followed by this pipe and logs every label/token pair.
     *
     * @param strArr paths of the files to process
     */
    public static void main(String[] strArr) {
        try {
            SerialPipes serialPipes = new SerialPipes(new Pipe[]{new Input2CharSequence(), new SGML2TokenSequence()});
            for (int i = 0; i < strArr.length; i++) {
                Instance instance = new Instance(new File(strArr[i]), null, null, null, serialPipes);
                TokenSequence dataTokens = (TokenSequence) instance.getData();
                TokenSequence targetTokens = (TokenSequence) instance.getTarget();
                logger.finer("===");
                logger.info(strArr[i]);
                for (int j = 0; j < dataTokens.size(); j++) {
                    logger.info(targetTokens.getToken(j).getText() + MinimalPrettyPrinter.DEFAULT_ROOT_VALUE_SEPARATOR + dataTokens.getToken(j).getText());
                }
            }
        } catch (Exception e) {
            System.out.println(e);
            e.printStackTrace();
        }
    }

    /**
     * Writes the custom serialized form: version number, pattern, lexer, background tag,
     * then the source-saving flag.
     *
     * @param objectOutputStream stream to write to
     * @throws IOException if the underlying stream fails
     */
    private void writeObject(ObjectOutputStream objectOutputStream) throws IOException {
        objectOutputStream.writeInt(CURRENT_SERIAL_VERSION);
        objectOutputStream.writeObject(this.sgmlPattern);
        objectOutputStream.writeObject(this.lexer);
        objectOutputStream.writeObject(this.backgroundTag);
        objectOutputStream.writeBoolean(this.saveSource);
    }

    /**
     * Reads the custom serialized form written by {@link #writeObject(ObjectOutputStream)}.
     *
     * <p>Version 0 streams are treated as carrying no source-saving flag, so it defaults to
     * {@code true}. Later versions carry the flag as the final boolean, which must be read
     * back; otherwise the field silently stays {@code false} after deserialization.
     *
     * @param objectInputStream stream to read from
     * @throws IOException if the underlying stream fails
     * @throws ClassNotFoundException if a serialized class cannot be resolved
     */
    private void readObject(ObjectInputStream objectInputStream) throws IOException, ClassNotFoundException {
        int version = objectInputStream.readInt();
        this.sgmlPattern = (Pattern) objectInputStream.readObject();
        this.lexer = (CharSequenceLexer) objectInputStream.readObject();
        this.backgroundTag = (String) objectInputStream.readObject();
        if (version == 0) {
            this.saveSource = true;
        } else {
            this.saveSource = objectInputStream.readBoolean();
        }
    }
}
