package edu.umass.cs.mallet.base.pipe;

import edu.umass.cs.mallet.base.types.Instance;
import edu.umass.cs.mallet.base.types.Token;
import edu.umass.cs.mallet.base.types.TokenSequence;
import edu.umass.cs.mallet.base.util.CharSequenceLexer;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;

/* loaded from: input_file:WEB-INF/lib/mallet-0.4-jaeschke.jar:edu/umass/cs/mallet/base/pipe/SelectiveSGML2TokenSequence.class */
/**
 * Pipe that converts SGML-style marked-up text into two parallel token sequences:
 * the lexed text tokens and, for each token, the label of the tag it appeared in.
 *
 * <p>Only tags whose name is contained in {@code allowedTags} are interpreted as markup.
 * Every other {@code <...>} span is left in the text and handed to the lexer like any
 * other characters. Text that is not inside an allowed tag is labelled with
 * {@code backgroundTag}.
 *
 * <p>The lexer is a field that is re-pointed at a new text span during every
 * {@link #pipe(Instance)} call, so a single instance should not be used from several
 * threads at once.
 */
public class SelectiveSGML2TokenSequence extends Pipe implements Serializable {
    /** Matches any opening or closing tag, i.e. {@code <...>} or {@code </...>}. */
    Pattern sgmlPattern;
    /** Tokenizer applied to each run of text between recognised tags. */
    CharSequenceLexer lexer;
    /** Label given to tokens that are not enclosed by an allowed tag. */
    String backgroundTag;
    /** Tag names (the text between {@code <} or {@code </} and {@code >}) treated as markup. */
    Set allowedTags;
    private static final long serialVersionUID = 1;
    /** Version marker written at the head of the custom serialized form. */
    private static final int CURRENT_SERIAL_VERSION = 0;

    /**
     * Creates a pipe with an explicit lexer, background label and set of allowed tags.
     *
     * @param charSequenceLexer lexer used to split the text between tags into tokens
     * @param str label assigned to tokens outside any allowed tag
     * @param set tag names that are recognised as markup
     */
    public SelectiveSGML2TokenSequence(CharSequenceLexer charSequenceLexer, String str, Set set) {
        this.sgmlPattern = Pattern.compile("</?([^>]*)>");
        this.lexer = charSequenceLexer;
        this.backgroundTag = str;
        this.allowedTags = set;
    }

    /**
     * Creates a pipe whose lexer is built from {@code str}.
     *
     * @param str argument passed to {@code new CharSequenceLexer(String)}
     *            (presumably the token regex - confirm against CharSequenceLexer)
     * @param str2 label assigned to tokens outside any allowed tag
     * @param set tag names that are recognised as markup
     */
    public SelectiveSGML2TokenSequence(String str, String str2, Set set) {
        this(new CharSequenceLexer(str), str2, set);
    }

    /**
     * Creates a pipe with a default-constructed lexer and the background label {@code "O"}.
     *
     * @param set tag names that are recognised as markup
     */
    public SelectiveSGML2TokenSequence(Set set) {
        this(new CharSequenceLexer(), "O", set);
    }

    /**
     * Creates a pipe with the given lexer and the background label {@code "O"}.
     *
     * @param charSequenceLexer lexer used to split the text between tags into tokens
     * @param set tag names that are recognised as markup
     */
    public SelectiveSGML2TokenSequence(CharSequenceLexer charSequenceLexer, Set set) {
        this(charSequenceLexer, "O", set);
    }

    /**
     * Replaces the instance's character data with a token sequence and sets a parallel
     * label sequence as its target.
     *
     * <p>The text is scanned for allowed tags. Each run of text between two allowed tags
     * (or between a tag and either end of the input) is lexed, and every resulting token
     * is labelled with the tag that was current for that run. An allowed opening tag makes
     * the full text between its {@code <} and {@code >} the current label; an allowed
     * closing tag switches back to the background label.
     *
     * @param instance carrier whose data must be a {@link CharSequence}
     * @return the same instance, with data and source set to the token sequence and target
     *         set to the label sequence
     * @throws ClassCastException if the instance data is {@code null} or not a CharSequence
     */
    @Override // edu.umass.cs.mallet.base.pipe.Pipe
    public Instance pipe(Instance instance) {
        Object data = instance.getData();
        if (!(data instanceof CharSequence)) {
            // Report null data explicitly instead of failing with a NullPointerException.
            String actualType = data == null ? "null" : data.getClass().getName();
            throw new ClassCastException("carrier.data is a " + actualType + " not a CharSequence");
        }
        CharSequence text = (CharSequence) data;
        TokenSequence dataTokens = new TokenSequence();
        TokenSequence targetTokens = new TokenSequence();
        Matcher matcher = this.sgmlPattern.matcher(text);

        String tag = this.backgroundTag;
        int textStart = 0;
        boolean done = false;
        while (!done) {
            int textEnd;
            int nextStart;
            String nextTag;
            if (findNextValidMatch(matcher)) {
                String sgml = matcher.group();
                if (sgml.charAt(1) == '/') {
                    // A closing tag ends the labelled region.
                    nextTag = this.backgroundTag;
                } else {
                    // Label is everything between '<' and '>'.
                    nextTag = sgml.substring(1, sgml.length() - 1);
                }
                textEnd = matcher.start();
                nextStart = matcher.end();
            } else {
                // No more tags: the final run extends to the end of the input.
                // subSequence's end index is exclusive, so this must be length(), not
                // length() - 1, otherwise the last character is lost.
                done = true;
                textEnd = text.length();
                nextStart = textEnd;
                nextTag = tag;
            }
            if (textEnd > textStart) {
                this.lexer.setCharSequence(text.subSequence(textStart, textEnd));
                while (this.lexer.hasNext()) {
                    dataTokens.add(new Token((String) this.lexer.next()));
                    targetTokens.add(new Token(tag));
                }
            }
            textStart = nextStart;
            tag = nextTag;
        }
        instance.setData(dataTokens);
        instance.setTarget(targetTokens);
        instance.setSource(dataTokens);
        return instance;
    }

    /**
     * Advances the matcher to the next tag whose name is in {@code allowedTags}.
     *
     * <p>Implemented as a loop rather than recursion so that a long run of disallowed
     * tags cannot exhaust the call stack.
     *
     * @param matcher matcher over the input text, positioned after the previous match
     * @return {@code true} if the matcher now rests on an allowed tag, {@code false} if the
     *         input contains no further allowed tag
     */
    private boolean findNextValidMatch(Matcher matcher) {
        while (matcher.find()) {
            String sgml = matcher.group();
            // Skip the '/' of a closing tag so that only the name is looked up.
            int nameStart = sgml.charAt(1) == '/' ? 2 : 1;
            String name = sgml.substring(nameStart, sgml.length() - 1);
            if (this.allowedTags.contains(name)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns a multi-line description of this pipe's configuration.
     *
     * @return the tag pattern, lexer pattern, background tag and allowed tag set
     */
    @Override
    public String toString() {
        return "sgml pattern: " + this.sgmlPattern
                + "\nlexer: " + this.lexer.getPattern()
                + "\nbg tag: " + this.backgroundTag
                + "\nallowedHash: " + this.allowedTags
                + "\n";
    }

    /**
     * Writes the serial version marker followed by the four configuration fields.
     *
     * @param objectOutputStream stream to write to
     * @throws IOException if the underlying stream fails
     */
    private void writeObject(ObjectOutputStream objectOutputStream) throws IOException {
        objectOutputStream.writeInt(CURRENT_SERIAL_VERSION);
        objectOutputStream.writeObject(this.sgmlPattern);
        objectOutputStream.writeObject(this.lexer);
        objectOutputStream.writeObject(this.backgroundTag);
        objectOutputStream.writeObject(this.allowedTags);
    }

    /**
     * Reads the fields in the order written by {@link #writeObject(ObjectOutputStream)}.
     *
     * @param objectInputStream stream to read from
     * @throws IOException if the underlying stream fails
     * @throws ClassNotFoundException if a serialized field's class cannot be resolved
     */
    private void readObject(ObjectInputStream objectInputStream) throws IOException, ClassNotFoundException {
        // The version marker is consumed but not inspected; only one format exists.
        objectInputStream.readInt();
        this.sgmlPattern = (Pattern) objectInputStream.readObject();
        this.lexer = (CharSequenceLexer) objectInputStream.readObject();
        this.backgroundTag = (String) objectInputStream.readObject();
        this.allowedTags = (Set) objectInputStream.readObject();
    }
}
