package org.bibsonomy.scraper.url.kde.acm;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.bibsonomy.model.util.BibTexUtils;
import org.bibsonomy.scraper.AbstractUrlScraper;
import org.bibsonomy.scraper.ScrapingContext;
import org.bibsonomy.scraper.Tuple;
import org.bibsonomy.scraper.exceptions.InternalFailureException;
import org.bibsonomy.scraper.exceptions.ScrapingException;
import org.bibsonomy.scraper.exceptions.ScrapingFailureException;
import org.bibsonomy.util.WebUtils;
import org.bibsonomy.util.XmlUtils;
import org.bibsonomy.util.id.DOIUtils;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/* loaded from: input_file:org/bibsonomy/scraper/url/kde/acm/ACMBasicScraper.class */
/**
 * URL scraper for publication pages of the ACM Digital Library
 * ({@code portal.acm.org}), including the {@code /beta/} variant of the site.
 *
 * <p>The scraper locates the "BibTeX" link(s) on the publication page, downloads
 * the export page(s), collects the text of their {@code <pre>} elements and then
 * enriches the resulting BibTeX with a {@code url} and, when one can be found,
 * an {@code abstract} field.
 */
public class ACMBasicScraper extends AbstractUrlScraper {
    private static final Log log = LogFactory.getLog(ACMBasicScraper.class);

    private static final String BIBTEX_STRING_ON_ACM = "BibTeX";
    private static final String BROKEN_END = "},\n }";
    private static final String BETA_SITE_URL = "http://portal.acm.org/beta/";
    private static final String BETA_DOWNLOAD_PATH = "exportformats.cfm?";
    private static final String BETA_ABSTRACT_PATH = "tab_abstract.cfm?";
    private static final String BETA_DOWNLOAD_ID_PATTERN = "navigate[(]'.*?(id=.*)&.*[)]";
    private static final String P_TAG_CLASS_FONT = "<\\s*p\\s+class=[\"|']abstract[\"|']>\\s*<\\s*font\\s+color.*?<\\s*/\\s*p\\s*>";
    private static final String P_TAG_CLASS = ".*(<\\s*p\\s+class=[\"|']abstract[\"|']>.*)";
    private static final String P_TAG_CLASS_ENCLOSED = ".*<\\s*p\\s+class=[\"|']abstract[\"|']>\\s*(.*?)<\\s*/\\s*p\\s*>.*";
    private static final String P_START_TAG = "(<\\s*p\\s*>)(.*)";
    private static final String P_END_TAG = "(<\\s*/\\s*p\\s*>)(.*)";
    private static final String HTML_TAG = "\\<.*?\\>";
    private static final String MULTIPLE_WHITESPACE = "\\s{2,}";
    private static final String SITE_URL = "http://portal.acm.org/";
    private static final String SITE_NAME = "ACM Digital Library";
    private static final String info = "This scraper parses a publication page from the " + href(SITE_URL, SITE_NAME);

    /** Flags shared by the abstract-extraction patterns (numeric value 40). */
    private static final int MULTILINE_DOTALL = Pattern.MULTILINE | Pattern.DOTALL;

    /* Patterns are compiled once; compiling them per call (or per loop iteration) is wasted work. */
    private static final Pattern BETA_DOWNLOAD_ID = Pattern.compile(BETA_DOWNLOAD_ID_PATTERN);
    private static final Pattern ABSTRACT_FONT_PARAGRAPH = Pattern.compile(P_TAG_CLASS_FONT);
    private static final Pattern ABSTRACT_PARAGRAPH_START = Pattern.compile(P_TAG_CLASS, MULTILINE_DOTALL);
    private static final Pattern ABSTRACT_PARAGRAPH_ENCLOSED = Pattern.compile(P_TAG_CLASS_ENCLOSED, MULTILINE_DOTALL);
    private static final Pattern PARAGRAPH_START = Pattern.compile(P_START_TAG, MULTILINE_DOTALL);
    private static final Pattern PARAGRAPH_END = Pattern.compile(P_END_TAG, MULTILINE_DOTALL);
    private static final Pattern ANY_HTML_TAG = Pattern.compile(HTML_TAG, MULTILINE_DOTALL);

    /** Upper bound for the step counter of the paragraph-tag balancing loop. */
    private static final int MAX_BALANCE_STEPS = 10;

    private static final List<Tuple<Pattern, Pattern>> patterns = Collections.singletonList(
            new Tuple<Pattern, Pattern>(Pattern.compile(".*portal.acm.org"), AbstractUrlScraper.EMPTY_PATTERN));

    /**
     * Scrapes the page held by the given context and stores the resulting BibTeX in it.
     *
     * @param scrapingContext context providing the page URL and content; receives the result
     * @return {@code true} when a non-empty BibTeX result was stored
     * @throws ScrapingException an {@link InternalFailureException} wrapping any failure,
     *         including the case that no BibTeX could be obtained
     */
    @Override // org.bibsonomy.scraper.AbstractUrlScraper
    protected boolean scrapeInternal(ScrapingContext scrapingContext) throws ScrapingException {
        scrapingContext.setScraper(this);
        try {
            final StringBuffer bibtex = new StringBuffer();
            final Document dom = XmlUtils.getDOM(scrapingContext.getPageContent());

            String abstractText;
            if (scrapingContext.getUrl().toString().matches(".*/beta/.*")) {
                bibtex.append(extractBibtexEntries(BETA_SITE_URL, extractSinglePathBeta(dom)).toString().trim());
                abstractText = extractBetaAbstract(scrapingContext);
            } else {
                bibtex.append(extractBibtexEntries(SITE_URL, extractSinglePath(dom)));
                abstractText = extractAbstract(dom);
            }

            // Repair an entry that ends with a dangling comma before the closing brace.
            final int brokenEndPos = bibtex.indexOf(BROKEN_END, (bibtex.length() - BROKEN_END.length()) - 1);
            if (brokenEndPos > 0) {
                bibtex.replace(brokenEndPos, bibtex.length(), "}\n}");
            }

            BibTexUtils.addFieldIfNotContained(bibtex, "url", scrapingContext.getUrl().toString());

            // Fall back to the regex-based extraction when the DOM-based one found nothing.
            if (abstractText == null) {
                abstractText = extractAbstract(scrapingContext);
            }
            if (abstractText != null) {
                BibTexUtils.addFieldIfNotContained(bibtex, "abstract", abstractText);
            } else {
                log.info("ACMBasicScraper: Abstract not available");
            }

            final String result = DOIUtils.cleanDOI(bibtex.toString().trim());
            if ("".equals(result)) {
                throw new ScrapingFailureException("getting bibtex failed");
            }
            scrapingContext.setBibtexResult(result);
            return true;
        } catch (Exception e) {
            // NOTE(review): this also wraps the ScrapingFailureException thrown above;
            // kept as-is because callers may depend on receiving InternalFailureException.
            throw new InternalFailureException(e);
        }
    }

    /**
     * Extracts the abstract from the DOM of a (non-beta) publication page.
     *
     * <p>Looks for {@code <a name="abstract">ABSTRACT</a>}, takes the second sibling
     * following the anchor's parent and, if that node has {@code class="abstract"},
     * returns the text of the {@code p} element directly following it, or of the node
     * itself when no such {@code p} follows. When several anchors qualify, the last wins.
     *
     * @param document parsed publication page
     * @return the abstract text, or {@code null} when the expected structure is absent
     */
    private String extractAbstract(Document document) {
        String abstractText = null;
        final NodeList anchors = document.getElementsByTagName("a");
        for (int i = 0; i < anchors.getLength(); i++) {
            final Node anchor = anchors.item(i);
            if (!isAbstractHeadingAnchor(anchor)) {
                continue;
            }
            final Node candidate = secondSiblingAfterParent(anchor);
            if (!hasAttributeValue(candidate, "class", "abstract")) {
                continue;
            }
            final Node following = candidate.getNextSibling();
            if (following != null && "p".equals(following.getNodeName())) {
                abstractText = XmlUtils.getText(following);
            } else {
                abstractText = XmlUtils.getText(candidate);
            }
        }
        return abstractText;
    }

    /** Tells whether the node is an anchor with {@code name="abstract"} whose first child has the value "ABSTRACT". */
    private static boolean isAbstractHeadingAnchor(Node anchor) {
        if (!hasAttributeValue(anchor, "name", "abstract")) {
            return false;
        }
        final Node firstChild = anchor.getFirstChild();
        return firstChild != null && "ABSTRACT".equals(firstChild.getNodeValue());
    }

    /** Returns the second sibling after the node's parent, or {@code null} if any link of that chain is missing. */
    private static Node secondSiblingAfterParent(Node node) {
        final Node parent = node.getParentNode();
        final Node firstSibling = parent == null ? null : parent.getNextSibling();
        return firstSibling == null ? null : firstSibling.getNextSibling();
    }

    /** Null-safe check that the node carries the named attribute with exactly the expected value. */
    private static boolean hasAttributeValue(Node node, String attributeName, String expected) {
        if (node == null || !node.hasAttributes()) {
            return false;
        }
        final Node attribute = node.getAttributes().getNamedItem(attributeName);
        return attribute != null && expected.equals(attribute.getNodeValue());
    }

    /**
     * Extracts the abstract of a beta-site publication by downloading the abstract tab.
     *
     * <p>The publication id is taken from the first {@code navigate('...id=...&...')} call
     * found in the page content. In the downloaded tab, the text of the last {@code div}
     * whose {@code style} attribute equals {@code display:inline} is returned.
     *
     * @param scrapingContext context providing the page content
     * @return the abstract text, or {@code null} when no id or no matching {@code div} is found
     * @throws ScrapingException if the page content cannot be obtained from the context
     * @throws IOException if downloading the abstract tab fails
     */
    private String extractBetaAbstract(ScrapingContext scrapingContext) throws ScrapingException, IOException {
        final Matcher idMatcher = BETA_DOWNLOAD_ID.matcher(scrapingContext.getPageContent());
        if (!idMatcher.find()) {
            return null;
        }
        final String tabUrl = BETA_SITE_URL + BETA_ABSTRACT_PATH + idMatcher.group(1) + "&usebody=tabBody";
        final NodeList divs = XmlUtils.getDOM(WebUtils.getContentAsString(tabUrl)).getElementsByTagName("div");

        String abstractText = null;
        for (int i = 0; i < divs.getLength(); i++) {
            final Node div = divs.item(i);
            // A div may have attributes without having a style attribute at all.
            if (hasAttributeValue(div, "style", "display:inline")) {
                abstractText = XmlUtils.getText(div);
            }
        }
        return abstractText;
    }

    /**
     * Regex-based fallback that extracts the abstract from the raw page content.
     *
     * <p>Steps: abstract paragraphs that start with a {@code <font color...>} element are
     * removed; the content from the last {@code <p class="abstract">} onwards is copied
     * into a working buffer; plain {@code <p>} / {@code </p>} tags are then deleted from
     * that buffer one per step while a nesting counter is maintained; finally the content
     * of the {@code <p class="abstract">...</p>} element is stripped of HTML tags,
     * whitespace-normalised and HTML-unescaped.
     *
     * <p>NOTE(review): when the final pattern does not match, the intermediate (unparsed)
     * content is returned rather than {@code null}; this is the pre-existing contract and
     * is kept unchanged here.
     *
     * @param scrapingContext context providing the page content
     * @return the extracted abstract, or the intermediate content when no enclosed
     *         abstract paragraph could be matched
     * @throws ScrapingException if the page content cannot be obtained from the context
     */
    final String extractAbstract(ScrapingContext scrapingContext) throws ScrapingException {
        String content = ABSTRACT_FONT_PARAGRAPH.matcher(scrapingContext.getPageContent()).replaceAll("");

        final StringBuilder fragment = new StringBuilder();
        final Matcher abstractStart = ABSTRACT_PARAGRAPH_START.matcher(content);
        if (abstractStart.find()) {
            fragment.append(abstractStart.group(1));
        }

        int searchFrom = 0;
        int startTagPos = 0;
        int startTagLength = 0;
        int endTagPos = 0;
        int endTagLength = 0;
        int depth = 1;
        int step = 1;
        boolean startTagSeen = true; // stays false once a search for <p> has failed
        boolean finished = true;     // false when the step limit stopped the loop
        while (true) {
            // Positions are only updated on a hit; otherwise the previous values are reused.
            final Matcher endTag = PARAGRAPH_END.matcher(fragment);
            if (endTag.find(searchFrom)) {
                endTagPos = endTag.start();
                endTagLength = endTag.group(1).length();
            }
            final Matcher startTag = PARAGRAPH_START.matcher(fragment);
            if (startTag.find(searchFrom)) {
                startTagPos = startTag.start();
                startTagLength = startTag.group(1).length();
            } else {
                startTagSeen = false;
            }

            if (endTagPos < startTagPos || !startTagSeen) {
                // The closing tag comes first: leave one nesting level and drop the tag.
                depth--;
                searchFrom = endTagPos;
                fragment.delete(searchFrom, searchFrom + endTagLength);
            } else {
                // The opening tag comes first: enter a nesting level and drop the tag.
                searchFrom = startTagPos;
                depth++;
                fragment.delete(searchFrom, searchFrom + startTagLength);
            }

            step++;
            if (step > MAX_BALANCE_STEPS) {
                finished = false;
                break;
            }
            if (depth <= 1) {
                break;
            }
        }
        if (finished) {
            content = fragment.toString();
        }

        final Matcher enclosed = ABSTRACT_PARAGRAPH_ENCLOSED.matcher(content);
        if (!enclosed.matches() || enclosed.group(1) == null) {
            return content;
        }
        final String withoutTags = ANY_HTML_TAG.matcher(enclosed.group(1).trim()).replaceAll("");
        return StringEscapeUtils.unescapeHtml(withoutTags.replaceAll(MULTIPLE_WHITESPACE, " "));
    }

    /**
     * Returns the text between the first pair of single quotes of an {@code onclick} value.
     *
     * @param onclick value of the {@code onclick} attribute, may be {@code null}
     * @return the quoted text, or an empty string when there is no complete quoted section
     */
    private static String extractPathFromOnclickNode(String onclick) {
        if (onclick == null) {
            return "";
        }
        final int open = onclick.indexOf('\'');
        if (open < 0) {
            return "";
        }
        final int close = onclick.indexOf('\'', open + 1);
        if (close < 0) {
            return "";
        }
        return onclick.substring(open + 1, close);
    }

    /**
     * Builds the beta-site BibTeX export path from a {@code navigate('...')} link value.
     *
     * @param href value of the {@code href} attribute, may be {@code null}
     * @return the export path, or an empty string when no publication id is found
     */
    private static String extractPathFromHref(String href) {
        if (href == null) {
            return "";
        }
        final Matcher idMatcher = BETA_DOWNLOAD_ID.matcher(href);
        if (idMatcher.find()) {
            return BETA_DOWNLOAD_PATH + idMatcher.group(1) + "&expformat=bibtex";
        }
        return "";
    }

    /**
     * Collects the value of the given attribute from every anchor whose first child has
     * the value "BibTeX". Anchors lacking the attribute are skipped.
     */
    private static List<String> bibtexLinkAttributeValues(Document document, String attributeName) {
        final List<String> values = new ArrayList<String>();
        final NodeList anchors = document.getElementsByTagName("a");
        for (int i = 0; i < anchors.getLength(); i++) {
            final Node anchor = anchors.item(i);
            final Node firstChild = anchor.getFirstChild();
            if (firstChild == null || !BIBTEX_STRING_ON_ACM.equals(firstChild.getNodeValue())) {
                continue;
            }
            final Node attribute = anchor.hasAttributes() ? anchor.getAttributes().getNamedItem(attributeName) : null;
            if (attribute != null) {
                values.add(attribute.getNodeValue());
            }
        }
        return values;
    }

    /**
     * Finds the BibTeX download paths on a (non-beta) publication page, taken from the
     * {@code onclick} handlers of the "BibTeX" links.
     *
     * @param document parsed publication page
     * @return the download paths relative to the site URL; never {@code null}
     */
    private List<String> extractSinglePath(Document document) {
        final List<String> paths = new ArrayList<String>();
        for (String onclick : bibtexLinkAttributeValues(document, "onclick")) {
            final String path = extractPathFromOnclickNode(onclick);
            // An empty path would make the caller download the bare site URL.
            if (path.length() > 0) {
                paths.add(path);
            }
        }
        return paths;
    }

    /**
     * Finds the BibTeX download paths on a beta-site publication page, taken from the
     * {@code href} values of the "BibTeX" links.
     *
     * @param document parsed publication page
     * @return the download paths relative to the beta site URL; never {@code null}
     */
    private List<String> extractSinglePathBeta(Document document) {
        final List<String> paths = new ArrayList<String>();
        for (String href : bibtexLinkAttributeValues(document, "href")) {
            final String path = extractPathFromHref(href);
            // An empty path would make the caller download the bare site URL.
            if (path.length() > 0) {
                paths.add(path);
            }
        }
        return paths;
    }

    /**
     * Downloads every export page and concatenates the text of its {@code <pre>} elements.
     *
     * @param siteUrl base URL that each path is appended to
     * @param paths download paths relative to {@code siteUrl}
     * @return the collected entries, each trimmed and preceded by a single space
     * @throws MalformedURLException if a base URL plus path does not form a valid URL
     * @throws IOException if a download fails
     */
    private StringBuffer extractBibtexEntries(String siteUrl, List<String> paths) throws MalformedURLException, IOException {
        final StringBuffer entries = new StringBuffer();
        for (String path : paths) {
            final NodeList preElements = XmlUtils.getDOM(new URL(siteUrl + path)).getElementsByTagName("pre");
            for (int i = 0; i < preElements.getLength(); i++) {
                final Node firstChild = preElements.item(i).getFirstChild();
                // The node value is null when the first child is not a text node.
                final String text = firstChild == null ? null : firstChild.getNodeValue();
                if (text != null) {
                    entries.append(" ").append(text.trim());
                }
            }
        }
        return entries;
    }

    /** Returns a short description of this scraper, including a link to the supported site. */
    @Override // org.bibsonomy.scraper.Scraper
    public String getInfo() {
        return info;
    }

    /** Returns the host/path pattern pairs of the URLs this scraper handles. */
    @Override // org.bibsonomy.scraper.AbstractUrlScraper, org.bibsonomy.scraper.UrlScraper
    public List<Tuple<Pattern, Pattern>> getUrlPatterns() {
        return patterns;
    }

    /** Returns the display name of the supported site. */
    @Override // org.bibsonomy.scraper.UrlScraper
    public String getSupportedSiteName() {
        return SITE_NAME;
    }

    /** Returns the base URL of the supported site. */
    @Override // org.bibsonomy.scraper.UrlScraper
    public String getSupportedSiteURL() {
        return SITE_URL;
    }
}
