package org.bibsonomy.scraper.url.kde.acm;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.bibsonomy.common.Pair;
import org.bibsonomy.model.util.BibTexUtils;
import org.bibsonomy.scraper.AbstractUrlScraper;
import org.bibsonomy.scraper.CitedbyScraper;
import org.bibsonomy.scraper.ReferencesScraper;
import org.bibsonomy.scraper.ScrapingContext;
import org.bibsonomy.scraper.exceptions.InternalFailureException;
import org.bibsonomy.scraper.exceptions.ScrapingException;
import org.bibsonomy.scraper.exceptions.ScrapingFailureException;
import org.bibsonomy.util.ValidationUtils;
import org.bibsonomy.util.WebUtils;
import org.bibsonomy.util.XmlUtils;
import org.bibsonomy.util.id.DOIUtils;
import org.springframework.web.util.HtmlUtils;
import org.w3c.dom.NodeList;

/* loaded from: input_file:org/bibsonomy/scraper/url/kde/acm/ACMBasicScraper.class */
/**
 * Scraper for publication pages of the ACM Digital Library.
 *
 * <p>The numeric publication id is taken from the URL (either the {@code id=} query
 * parameter or the last path segment of a DOI resolver URL). With that id the scraper
 * downloads the BibTeX export page, tries to add the abstract from the abstract tab and
 * stores the result in the {@link ScrapingContext}. The id is also stored in the
 * context's temporary metadata so that {@link #scrapeReferences(ScrapingContext)} and
 * {@link #scrapeCitedby(ScrapingContext)} can download the matching tabs afterwards.
 */
public class ACMBasicScraper extends AbstractUrlScraper implements ReferencesScraper, CitedbyScraper {
    private static final Log log = LogFactory.getLog(ACMBasicScraper.class);

    private static final String SITE_NAME = "ACM Digital Library";
    private static final String SITE_URL = "http://portal.acm.org/";
    private static final String INFO = "This scraper parses a publication page from the " + href(SITE_URL, SITE_NAME);

    /** Prefix of the tab pages ("tab_references.cfm", "tab_citings.cfm"). */
    private static final String ACM_BASE_TAB_URL = "http://dl.acm.org/tab_";
    /** Name of the tab that lists the publications citing the scraped one. */
    private static final String CITINGS_TAB = "citings";
    /** Name of the tab that lists the references of the scraped publication. */
    private static final String REFERENCES_TAB = "references";

    /** Path (relative to {@link #SITE_URL}) of the BibTeX export; the id is appended. */
    private static final String BIBTEX_EXPORT_PATH = "exportformats.cfm?expformat=bibtex&id=";
    /** URL of the abstract tab; the id is appended. The double slash is kept as in the original request URL. */
    private static final String ABSTRACT_URL = "http://portal.acm.org//tab_abstract.cfm?usebody=tabbody&id=";

    /** Malformed entry ending (trailing comma after the last field) that gets repaired. */
    private static final String BROKEN_END = "},\n}";
    /** Replacement for {@link #BROKEN_END}. */
    private static final String FIXED_END = "}\n}";

    /** Matches {@code id=123} or {@code id=123.456}; group 2 is the part after the dot, if any. */
    private static final Pattern URL_PARAM_ID_PATTERN = Pattern.compile("id=(\\d+(?:\\.(\\d+))?)");
    /** Same id syntax as {@link #URL_PARAM_ID_PATTERN}, but preceded by a slash (DOI resolver URLs). */
    private static final Pattern DOI_URL_ID_PATTERN = Pattern.compile("/(\\d+(?:\\.(\\d+))?)");
    /** Extracts the abstract text (group 2) from the abstract tab page. */
    private static final Pattern ABSTRACT_PATTERN = Pattern.compile(
            "<div style=\"display:inline\">(\\s*<p>\\s*)?((?s).+?)(\\s*<\\/p>\\s*)?<\\/div>", Pattern.MULTILINE);
    /** Removes simple HTML tags and line breaks from the extracted abstract. */
    private static final String CLEANUP_ABSTRACT = "<[\\da-zA-Z\\s]*>|<\\s*/\\s*[\\da-zA-Z\\s]*>|\\r\\n|\\n";

    /** Pairs of (host pattern, path pattern) this scraper is responsible for. */
    private static final List<Pair<Pattern, Pattern>> patterns = new LinkedList<>();

    static {
        // An alternation is required here: the former character class "[(portal)(dl)]"
        // matched one single character out of "()portald" instead of the two host names.
        patterns.add(new Pair<>(Pattern.compile(".*(?:portal|dl)\\.acm\\.org"), Pattern.compile("(/beta)?/citation\\.cfm.*")));
        patterns.add(new Pair<>(Pattern.compile(".*doi\\.acm\\.org"), EMPTY_PATTERN));
    }

    /**
     * Downloads the BibTeX entry (plus abstract, if available) for the publication the
     * context's URL points to.
     *
     * @param scrapingContext context holding the URL; receives the id and the BibTeX result
     * @return {@code true} if BibTeX was stored in the context, {@code false} if no
     *         publication id could be found in the URL
     * @throws ScrapingException a {@link ScrapingFailureException} if no BibTeX could be
     *         obtained, an {@link InternalFailureException} wrapping any other failure
     */
    @Override // org.bibsonomy.scraper.AbstractUrlScraper
    protected boolean scrapeInternal(ScrapingContext scrapingContext) throws ScrapingException {
        scrapingContext.setScraper(this);
        try {
            final String id = extractId(scrapingContext.getUrl());
            if (id == null) {
                return false;
            }
            // remembered for scrapeReferences/scrapeCitedby
            scrapingContext.getTmpMetadata().setId(id);

            final HttpClient client = WebUtils.getHttpClient();
            final StringBuffer bibtex = extractBibtexEntries(client, SITE_URL, BIBTEX_EXPORT_PATH + id);

            final String abstractText = fetchAbstract(client, id);
            if (abstractText != null) {
                BibTexUtils.addFieldIfNotContained(bibtex, "abstract", abstractText);
            }
            repairBrokenEnd(bibtex);

            final String result = DOIUtils.cleanDOI(bibtex.toString().trim());
            if (!ValidationUtils.present(result)) {
                throw new ScrapingFailureException("getting bibtex failed");
            }
            scrapingContext.setBibtexResult(result);
            return true;
        } catch (ScrapingException e) {
            // keep the specific scraping failure instead of re-wrapping it below
            throw e;
        } catch (Exception e) {
            throw new InternalFailureException(e);
        }
    }

    /**
     * Extracts the publication id from the URL.
     *
     * <p>URLs with a query string are searched for {@code id=...}; URLs without one are
     * searched for a numeric path segment. For ids of the form {@code 123.456} the part
     * after the dot is returned.
     *
     * @param url the URL of the publication page
     * @return the id, or {@code null} if the URL contains none
     */
    private static String extractId(URL url) {
        final String query = url.getQuery();
        final Matcher idMatcher = query == null
                ? DOI_URL_ID_PATTERN.matcher(url.toExternalForm())
                : URL_PARAM_ID_PATTERN.matcher(query);
        if (!idMatcher.find()) {
            return null;
        }
        final String afterDot = idMatcher.group(2);
        return afterDot != null ? afterDot : idMatcher.group(1);
    }

    /**
     * Downloads the abstract tab and extracts the abstract with HTML tags and line breaks
     * removed and HTML entities unescaped.
     *
     * @param httpClient client used for the request
     * @param id the publication id
     * @return the cleaned abstract, or {@code null} if none could be extracted
     * @throws IOException if the download fails
     */
    private static String fetchAbstract(HttpClient httpClient, String id) throws IOException {
        final String page = WebUtils.getContentAsString(httpClient, ABSTRACT_URL + id);
        if (ValidationUtils.present(page)) {
            final Matcher abstractMatcher = ABSTRACT_PATTERN.matcher(page);
            if (abstractMatcher.find()) {
                final String rawAbstract = abstractMatcher.group(2);
                return rawAbstract == null ? null : HtmlUtils.htmlUnescape(rawAbstract.replaceAll(CLEANUP_ABSTRACT, ""));
            }
        }
        log.info("ACMBasicScraper: Abstract not available");
        return null;
    }

    /**
     * Replaces a {@link #BROKEN_END} found at the end of the buffer with {@link #FIXED_END}.
     *
     * @param bibtex the BibTeX buffer, modified in place
     */
    private static void repairBrokenEnd(StringBuffer bibtex) {
        // only look at the tail of the buffer (two characters of slack for trailing whitespace)
        final int brokenAt = bibtex.indexOf(BROKEN_END, bibtex.length() - BROKEN_END.length() - 2);
        if (brokenAt > 0) {
            bibtex.replace(brokenAt, bibtex.length(), FIXED_END);
        }
    }

    /**
     * Downloads {@code str + str2} and concatenates the text of all {@code pre} elements
     * found in the page.
     *
     * @param httpClient client used for the request
     * @param str first part of the URL
     * @param str2 second part of the URL
     * @return the concatenated text (empty if the page has no {@code pre} element)
     * @throws MalformedURLException declared for callers; see {@link IOException}
     * @throws IOException if the download fails
     */
    private static StringBuffer extractBibtexEntries(HttpClient httpClient, String str, String str2) throws MalformedURLException, IOException {
        final StringBuffer entries = new StringBuffer();
        final NodeList preElements = XmlUtils.getDOM(WebUtils.getContentAsString(httpClient, str + str2)).getElementsByTagName("pre");
        for (int i = 0; i < preElements.getLength(); i++) {
            entries.append(XmlUtils.getText(preElements.item(i)));
        }
        return entries;
    }

    /** @return a short HTML description of this scraper */
    @Override // org.bibsonomy.scraper.Scraper
    public String getInfo() {
        return INFO;
    }

    /** @return the (host pattern, path pattern) pairs this scraper supports */
    @Override // org.bibsonomy.scraper.AbstractUrlScraper, org.bibsonomy.scraper.UrlScraper
    public List<Pair<Pattern, Pattern>> getUrlPatterns() {
        return patterns;
    }

    /** @return the name of the supported site */
    @Override // org.bibsonomy.scraper.UrlScraper
    public String getSupportedSiteName() {
        return SITE_NAME;
    }

    /** @return the URL of the supported site */
    @Override // org.bibsonomy.scraper.UrlScraper
    public String getSupportedSiteURL() {
        return SITE_URL;
    }

    /**
     * Manual smoke test: scrapes a fixed list of sample URLs and prints the results.
     *
     * @param strArr ignored
     * @throws MalformedURLException if one of the sample URLs is malformed
     * @throws ScrapingException if scraping fails
     */
    public static void main(String[] strArr) throws MalformedURLException, ScrapingException {
        final String[] sampleUrls = {
            "http://portal.acm.org/citation.cfm?id=1015428&amp;coll=Portal&amp;dl=ACM&amp;CFID=22531872&amp;CFTOKEN=18437036",
            "http://portal.acm.org/citation.cfm?id=333115.333119&amp;coll=GUIDE&amp;dl=GUIDE&amp;CFID=11052258&amp;CFTOKEN=84161555",
            "http://portal.acm.org/citation.cfm?id=1105676",
            "http://portal.acm.org/citation.cfm?id=553876",
            "http://portal.acm.org/beta/citation.cfm?id=359859",
            "http://portal.acm.org/citation.cfm?id=1082036.1082037&amp;coll=Portal&amp;dl=GUIDE&amp;CFID=88775871&amp;CFTOKEN=40392553#",
            "http://doi.acm.org/10.1145/1105664.1105676",
            "http://portal.acm.org/citation.cfm?id=500737.500755"
        };
        for (String sampleUrl : sampleUrls) {
            System.out.println("trying url " + sampleUrl);
            ScrapingContext scrapingContext = new ScrapingContext(new URL(sampleUrl));
            new ACMBasicScraper().scrape(scrapingContext);
            System.out.println("\n----------------------------------------\n");
            System.out.println(scrapingContext.getBibtexResult());
            System.out.println("----------------------------------------\n");
        }
    }

    /**
     * Downloads the "citings" tab for the id stored in the context's temporary metadata
     * and stores the page as the context's cited-by data.
     *
     * @param scrapingContext context providing the id and receiving the result
     * @return {@code true} if content was downloaded and stored, {@code false} otherwise
     */
    @Override // org.bibsonomy.scraper.CitedbyScraper
    public boolean scrapeCitedby(ScrapingContext scrapingContext) throws ScrapingException {
        final String citedBy = downloadTab(scrapingContext, CITINGS_TAB);
        if (citedBy == null) {
            return false;
        }
        // only the cited-by data is touched; references are left as they are
        scrapingContext.setCitedBy(citedBy);
        return true;
    }

    /**
     * Downloads the "references" tab for the id stored in the context's temporary
     * metadata and stores the page as the context's references.
     *
     * @param scrapingContext context providing the id and receiving the result
     * @return {@code true} if content was downloaded and stored, {@code false} otherwise
     */
    @Override // org.bibsonomy.scraper.ReferencesScraper
    public boolean scrapeReferences(ScrapingContext scrapingContext) throws ScrapingException {
        final String references = downloadTab(scrapingContext, REFERENCES_TAB);
        if (references == null) {
            return false;
        }
        // only the references are touched; cited-by data is left as it is
        scrapingContext.setReferences(references);
        return true;
    }

    /**
     * Downloads one tab page for the id stored in the context's temporary metadata.
     * Failures are logged and reported as {@code null} rather than thrown.
     *
     * @param scrapingContext context providing the id
     * @param tab name of the tab, e.g. {@link #CITINGS_TAB} or {@link #REFERENCES_TAB}
     * @return the page content, or {@code null} if it is empty or the download failed
     */
    private static String downloadTab(ScrapingContext scrapingContext, String tab) {
        try {
            final String content = WebUtils.getContentAsString(WebUtils.getHttpClient(),
                    ACM_BASE_TAB_URL + tab + ".cfm?id=" + scrapingContext.getTmpMetadata().getId());
            return ValidationUtils.present(content) ? content : null;
        } catch (Exception e) {
            log.error("error while scraping " + tab + " for " + scrapingContext.getUrl(), e);
            return null;
        }
    }
}
