package edu.umass.cs.mallet.util.bibsonomy.clustering;

import edu.umass.cs.mallet.base.pipe.Pipe;
import edu.umass.cs.mallet.base.types.Instance;
import edu.umass.cs.mallet.base.types.Token;
import edu.umass.cs.mallet.base.types.TokenSequence;
import edu.umass.cs.mallet.base.util.CharSequenceLexer;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:WEB-INF/lib/bibsonomy-scraper-2.0.11.jar:edu/umass/cs/mallet/util/bibsonomy/clustering/SGML2FieldsPipe.class */
/**
 * Pipe that splits SGML-tagged text into per-field segments.
 *
 * <p>For an instance whose data is a tagged character sequence, {@link #pipe(Instance)}
 * replaces the data (and source) with a {@link TokenSequence} holding one token per
 * non-empty run of text between tags, sets the target to a parallel sequence holding the
 * tag in effect for each run, and records several instance properties: the original text,
 * the text with tags removed, the colon-separated parts of the instance name, and one
 * property per configured start/end tag pair.
 */
public class SGML2FieldsPipe extends Pipe implements Serializable {
    private static final long serialVersionUID = 1;

    /**
     * Version written at the head of the custom serial form. Version 0 carried only the
     * SGML regex, the background tag and the lexer regex; version 1 appends the property
     * keys, the tag arrays and the tag weights.
     */
    private static final int CURRENT_SERIAL_VERSION = 1;

    private static final String DEFAULT_SGML_REGEX = "</?([^>]*)>";
    private static final String DEFAULT_TRUE_CLUSTER_KEY = "true_id=";
    private static final String DEFAULT_SOURCE_KEY = "SOURCE";
    private static final String DEFAULT_TEXT_NO_TAG_KEY = "TEXT_NO_TAG";

    /** Regex matching an opening or closing tag; group 1 is the text between the brackets. */
    String sgmlRegex = DEFAULT_SGML_REGEX;
    Pattern sgmlPattern = Pattern.compile(DEFAULT_SGML_REGEX);
    /** Lexer supplied at construction; it is stored but not used by {@link #pipe(Instance)}. */
    CharSequenceLexer lexer;
    /** Pattern of {@link #lexer}, kept so the lexer can be rebuilt on deserialization. */
    String lexerRegex;
    /** Label given to text that is not inside any tag. */
    String backgroundTag;
    /** Property key under which the first part of the instance name is stored. */
    String refNoMeta;
    /** Property key under which the second part of the instance name is stored. */
    String clusterNoMeta;
    /** Property key under which the optional third part of the instance name is stored. */
    String clusterNoMeta_true = DEFAULT_TRUE_CLUSTER_KEY;
    /** Tags handed pairwise to {@code SGMLStringOperation.locateField}. */
    String[] startTags;
    String[] endTags;
    /** Per-tag weights; stored but not read anywhere in this class. */
    double[] tagWeight;
    /** Property key under which the original, still tagged, data is stored. */
    String Source = DEFAULT_SOURCE_KEY;
    /** Property key under which the tag-free text is stored. */
    String TEXT_NO_TAG = DEFAULT_TEXT_NO_TAG_KEY;

    /**
     * Creates a pipe around an existing lexer.
     *
     * @param lexer lexer to keep; its pattern is remembered for serialization
     * @param backgroundTag label for text outside any tag
     * @param refNoMeta property key for the first part of the instance name
     * @param clusterNoMeta property key for the second part of the instance name
     * @param startTags start tags of the fields to extract as properties
     * @param endTags end tags, parallel to {@code startTags}
     * @param tagWeight per-tag weights
     */
    public SGML2FieldsPipe(CharSequenceLexer lexer, String backgroundTag, String refNoMeta, String clusterNoMeta, String[] startTags, String[] endTags, double[] tagWeight) {
        this.lexer = lexer;
        this.backgroundTag = backgroundTag;
        this.lexerRegex = lexer.getPattern();
        this.refNoMeta = refNoMeta;
        this.clusterNoMeta = clusterNoMeta;
        this.startTags = startTags;
        this.endTags = endTags;
        this.tagWeight = tagWeight;
    }

    /**
     * Creates a pipe whose lexer is built from a regular expression.
     *
     * <p>Delegates to the lexer-based constructor so that {@code lexerRegex} is recorded;
     * leaving it unset would make a serialized pipe impossible to read back.
     *
     * @param lexerRegex regular expression for the lexer
     * @param backgroundTag label for text outside any tag
     * @param refNoMeta property key for the first part of the instance name
     * @param clusterNoMeta property key for the second part of the instance name
     * @param startTags start tags of the fields to extract as properties
     * @param endTags end tags, parallel to {@code startTags}
     * @param tagWeight per-tag weights
     */
    public SGML2FieldsPipe(String lexerRegex, String backgroundTag, String refNoMeta, String clusterNoMeta, String[] startTags, String[] endTags, double[] tagWeight) {
        this(new CharSequenceLexer(lexerRegex), backgroundTag, refNoMeta, clusterNoMeta, startTags, endTags, tagWeight);
    }

    /**
     * Creates a pipe with a default-constructed lexer and {@code "O"} as the background tag.
     *
     * @param refNoMeta property key for the first part of the instance name
     * @param clusterNoMeta property key for the second part of the instance name
     * @param startTags start tags of the fields to extract as properties
     * @param endTags end tags, parallel to {@code startTags}
     * @param tagWeight per-tag weights
     */
    public SGML2FieldsPipe(String refNoMeta, String clusterNoMeta, String[] startTags, String[] endTags, double[] tagWeight) {
        this(new CharSequenceLexer(), "O", refNoMeta, clusterNoMeta, startTags, endTags, tagWeight);
    }

    /**
     * Splits the instance's tagged text into field segments and annotates the instance.
     *
     * <p>The instance data must be a {@link CharSequence} and the instance name a
     * {@code String} of the form {@code ref:cluster} or {@code ref:cluster:trueCluster}.
     *
     * @param instance the instance to transform; modified in place
     * @return the same instance
     */
    @Override
    public Instance pipe(Instance instance) {
        CharSequence data = (CharSequence) instance.getData();
        // Work on a String view so that any CharSequence implementation is accepted.
        String text = data.toString();

        TokenSequence segments = new TokenSequence();
        TokenSequence segmentTags = new TokenSequence();
        StringBuilder untagged = new StringBuilder();

        Matcher matcher = this.sgmlPattern.matcher(text);
        String currentTag = this.backgroundTag;
        String followingTag = this.backgroundTag;
        int segmentStart = 0;
        int followingStart = 0;
        boolean exhausted = false;
        while (!exhausted) {
            int segmentEnd;
            exhausted = !matcher.find();
            if (exhausted) {
                // NOTE(review): the text after the last tag is cut one character short.
                // This may be meant to drop a trailing newline; kept as is - confirm.
                segmentEnd = text.length() - 1;
            } else {
                // A closing tag returns to the background label; an opening tag
                // labels the text that follows it.
                followingTag = matcher.group().charAt(1) == '/' ? this.backgroundTag : matcher.group(1).intern();
                followingStart = matcher.end();
                segmentEnd = matcher.start();
            }
            if (segmentEnd - segmentStart > 0) {
                String segment = text.substring(segmentStart, segmentEnd);
                segments.add(new Token(segment));
                segmentTags.add(new Token(currentTag));
                untagged.append(segment).append(' ');
            }
            segmentStart = followingStart;
            currentTag = followingTag;
        }

        instance.setData(segments);
        instance.setTarget(segmentTags);
        instance.setSource(segments);
        instance.setProperty(this.Source, data);
        instance.setProperty(this.TEXT_NO_TAG, untagged.toString());

        String[] nameParts = ((String) instance.getName()).split(":");
        assert nameParts.length == 2 || nameParts.length == 3
                : "instance name must have 2 or 3 ':'-separated parts";
        instance.setProperty(this.refNoMeta, nameParts[0]);
        instance.setProperty(this.clusterNoMeta, nameParts[1]);
        if (nameParts.length == 3) {
            instance.setProperty(this.clusterNoMeta_true, nameParts[2]);
        }

        for (int t = 0; t < this.startTags.length; t++) {
            String startTag = this.startTags[t];
            instance.setProperty(startTag, SGMLStringOperation.locateField(startTag, this.endTags[t], text));
        }
        return instance;
    }

    /** Empty entry point retained from the original class. */
    public static void main(String[] strArr) {
    }

    /**
     * Writes the custom serial form: the version number, the three version-0 fields, then
     * the configuration that {@link #pipe(Instance)} needs after deserialization.
     */
    private void writeObject(ObjectOutputStream objectOutputStream) throws IOException {
        objectOutputStream.writeInt(CURRENT_SERIAL_VERSION);
        objectOutputStream.writeObject(this.sgmlRegex);
        objectOutputStream.writeObject(this.backgroundTag);
        objectOutputStream.writeObject(this.lexerRegex);
        // Appended in version 1; without these a deserialized pipe cannot run.
        objectOutputStream.writeObject(this.refNoMeta);
        objectOutputStream.writeObject(this.clusterNoMeta);
        objectOutputStream.writeObject(this.startTags);
        objectOutputStream.writeObject(this.endTags);
        objectOutputStream.writeObject(this.tagWeight);
    }

    /**
     * Reads the custom serial form, accepting both version 0 and version 1 streams.
     *
     * <p>Field initializers do not run during deserialization, so the fixed property keys
     * are restored here explicitly.
     */
    private void readObject(ObjectInputStream objectInputStream) throws IOException, ClassNotFoundException {
        int version = objectInputStream.readInt();
        this.sgmlRegex = (String) objectInputStream.readObject();
        this.sgmlPattern = Pattern.compile(this.sgmlRegex);
        this.backgroundTag = (String) objectInputStream.readObject();
        this.lexerRegex = (String) objectInputStream.readObject();
        // Older streams may carry a null lexer regex; leave the lexer unset rather than fail.
        this.lexer = this.lexerRegex == null ? null : new CharSequenceLexer(this.lexerRegex);

        this.clusterNoMeta_true = DEFAULT_TRUE_CLUSTER_KEY;
        this.Source = DEFAULT_SOURCE_KEY;
        this.TEXT_NO_TAG = DEFAULT_TEXT_NO_TAG_KEY;

        if (version >= 1) {
            this.refNoMeta = (String) objectInputStream.readObject();
            this.clusterNoMeta = (String) objectInputStream.readObject();
            this.startTags = (String[]) objectInputStream.readObject();
            this.endTags = (String[]) objectInputStream.readObject();
            this.tagWeight = (double[]) objectInputStream.readObject();
        }
    }
}
