package org.bibsonomy.scraper.url.kde.science;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.bibsonomy.model.util.BibTexUtils;
import org.bibsonomy.scraper.AbstractUrlScraper;
import org.bibsonomy.scraper.ScrapingContext;
import org.bibsonomy.scraper.Tuple;
import org.bibsonomy.scraper.exceptions.InternalFailureException;
import org.bibsonomy.scraper.exceptions.PageNotSupportedException;
import org.bibsonomy.scraper.exceptions.ScrapingException;
import org.bibsonomy.scraper.exceptions.ScrapingFailureException;
import org.bibsonomy.util.WebUtils;
import org.bibsonomy.util.id.DOIUtils;

/* loaded from: input_file:WEB-INF/lib/bibsonomy-scraper-2.0.15.jar:org/bibsonomy/scraper/url/kde/science/ScienceDirectScraper.class */
/**
 * URL scraper for ScienceDirect publication pages.
 *
 * <p>The scraper locates the citation download page of an article, reads the
 * hidden form fields from it, posts the export form requesting BibTeX and
 * finally normalizes the returned BibTeX (keywords merged into one field,
 * quoted values converted to braces, page ranges and DOI cleaned).
 */
public class ScienceDirectScraper extends AbstractUrlScraper {
    private static final String SCIENCE_CITATION_HOST = "sciencedirect.com";
    private static final String SCIENCE_CITATION_PATH = "/science";
    private static final String SCIENCE_CITATION_URL = "http://www.sciencedirect.com/science";
    /** Scheme and host that site-relative download paths are resolved against. */
    private static final String SCIENCE_DIRECT_BASE_URL = "http://www.sciencedirect.com";
    private static final String KEYWORD_DELIMITER = ", ";
    private static final String SITE_URL = "http://www.sciencedirect.com/";
    private static final String SITE_NAME = "ScienceDirect";
    private static final String info = "This scraper parses a publication page from " + href(SITE_URL, SITE_NAME) + ".";

    /** Link to the citation download page as it appears on an article page. */
    private static final String PATTERN_DOWNLOAD_PAGE_LINK = "<a href=\"(/science\\?_ob=DownloadURL[^\"]*)\"";
    private static final Pattern patternDownload = Pattern.compile(PATTERN_DOWNLOAD_PAGE_LINK);

    /* Hidden form fields of the download page that are sent back with the export request. */
    private static final String PATTERN_ACCT = "<input type=hidden name=_acct value=([^>]*)>";
    private static final Pattern patternAcct = Pattern.compile(PATTERN_ACCT);
    private static final String PATTERN_ARTICLE_LIST_ID = "<input type=hidden name=_ArticleListID value=(.+?)>";
    private static final Pattern patternArList = Pattern.compile(PATTERN_ARTICLE_LIST_ID);
    private static final String PATTERN_USER_ID = "<input type=hidden name=_userid value=(.+?)>";
    private static final Pattern patternUserId = Pattern.compile(PATTERN_USER_ID);
    private static final String PATTERN_UIOKEY = "<input type=hidden name=_uoikey value=(.+?)>";
    private static final Pattern patternUiokey = Pattern.compile(PATTERN_UIOKEY);
    private static final String PATTERN_MD5 = "<input type=hidden name=md5 value=(.+?)>";
    private static final Pattern patternMD5 = Pattern.compile(PATTERN_MD5);

    /** A single {@code keywords = "..."} line of the exported BibTeX. */
    private static final String PATTERN_KEYWORDS = "keywords = \"(.+)\"";
    private static final Pattern patternKeywords = Pattern.compile(PATTERN_KEYWORDS);

    /** Opening quote of a field value ({@code = "}); replaced by an opening brace. */
    private static final String PATTERN_QUOTE_START = "\\s*=\\s*\"";
    private static final Pattern patternQuoteStart = Pattern.compile(PATTERN_QUOTE_START, Pattern.MULTILINE);

    /** Closing quote of a field value at the end of a line; replaced by a closing brace. */
    private static final String PATTERN_QUOTE_END = "\\\"\\s*,\\s*$|\\\"\\s*$";
    private static final Pattern patternQuoteEnd = Pattern.compile(PATTERN_QUOTE_END, Pattern.MULTILINE);

    /**
     * Page range written as {@code 12 - 34}. Both quote and brace delimiters are
     * accepted because {@link #scrapeInternal(ScrapingContext)} converts quotes to
     * braces before {@link #cleanBibTeX(String)} is applied.
     */
    private static final Pattern patternBrokenPages = Pattern.compile("(.*pages = [\"{][0-9]+) - ([0-9]+[\"}].*)", Pattern.DOTALL);

    private static final List<Tuple<Pattern, Pattern>> patterns = Collections.singletonList(
            new Tuple<Pattern, Pattern>(Pattern.compile(".*" + SCIENCE_CITATION_HOST), Pattern.compile(SCIENCE_CITATION_PATH + ".*")));

    /**
     * Scrapes the BibTeX export of the publication the context URL points to and
     * stores it as the BibTeX result of the context.
     *
     * @param scrapingContext context providing the URL (and page content) to scrape
     * @return {@code true} when a BibTeX result was stored in the context
     * @throws ScrapingException if the page is not supported, a required form
     *         field or the download link is missing, no BibTeX could be obtained,
     *         or an I/O error occurred while talking to the site
     */
    @Override // org.bibsonomy.scraper.AbstractUrlScraper
    protected boolean scrapeInternal(ScrapingContext scrapingContext) throws ScrapingException {
        scrapingContext.setScraper(this);
        try {
            final URL url = scrapingContext.getUrl();
            if (!url.getPath().startsWith(SCIENCE_CITATION_PATH)) {
                throw new PageNotSupportedException("This page is currently not supported.");
            }

            // Determine the site-relative path ("/science?_ob=DownloadURL...") of the download page.
            final String query = url.getQuery();
            final String downloadPath;
            if (query == null || query.contains("_ob=ArticleURL")) {
                // article page: the link to the download page is embedded in the HTML
                final String pageContent = scrapingContext.getPageContent();
                downloadPath = pageContent == null ? null : firstGroup(patternDownload, pageContent);
                if (downloadPath == null) {
                    throw new ScrapingFailureException("Download link is missing.");
                }
            } else if (query.contains("_ob=DownloadURL")) {
                // already the download page: use path + query only, the host is prepended below
                downloadPath = url.getFile();
            } else {
                throw new PageNotSupportedException("This page is currently not supported.");
            }

            final String downloadPage = WebUtils.getContentAsString(new URL(SCIENCE_DIRECT_BASE_URL + downloadPath));

            // Hidden form fields needed for the export request.
            final String acct = firstGroup(patternAcct, downloadPage);
            final String articleListId = firstGroup(patternArList, downloadPage);
            final String userId = firstGroup(patternUserId, downloadPage);
            final String uoiKey = firstGroup(patternUiokey, downloadPage);
            final String md5 = firstGroup(patternMD5, downloadPage);
            if (acct == null || userId == null || uoiKey == null || md5 == null) {
                throw new ScrapingFailureException("Needed ID is missing.");
            }

            // The article list id is optional and sent as an empty value when absent.
            final String postData = "_ob=DownloadURL&_method=finish&_acct=" + acct + "&_userid=" + userId + "&_docType=FLA&_ArticleListID=" + (articleListId == null ? "" : articleListId) + "&_uoikey=" + uoiKey + "&count=1&md5=" + md5 + "&JAVASCRIPT_ON=Y&format=cite-abs&citation-type=BIBTEX&Export=Export&RETURN_URL=http%3A%2F%2Fwww.sciencedirect.com%2Fscience%2Fhome";
            String bibtex = WebUtils.getPostContentAsString(new URL(SCIENCE_CITATION_URL), postData, "latin1");
            if (bibtex == null) {
                // NOTE(review): guards against a null response; confirm whether WebUtils can return null
                throw new ScrapingFailureException("getting bibtex failed");
            }

            // Collect the values of all keyword lines and strip those lines from the
            // BibTeX. The matcher keeps iterating over the unmodified export while the
            // working copy is being edited.
            final StringBuilder keywords = new StringBuilder();
            final Matcher keywordMatcher = patternKeywords.matcher(bibtex);
            while (keywordMatcher.find()) {
                keywords.append(keywordMatcher.group(1)).append(KEYWORD_DELIMITER);
                final String keywordLine = keywordMatcher.group();
                final String keywordLineWithComma = keywordLine + ",";
                bibtex = bibtex.contains(keywordLineWithComma) ? bibtex.replace(keywordLineWithComma, "") : bibtex.replace(keywordLine, "");
            }
            // drop the complete trailing delimiter (comma and space)
            if (keywords.length() >= KEYWORD_DELIMITER.length()) {
                keywords.setLength(keywords.length() - KEYWORD_DELIMITER.length());
            }

            // Convert "..." delimited values to {...} and add the merged keywords field.
            final String openedBraces = patternQuoteStart.matcher(bibtex).replaceAll(" = {");
            final String braced = patternQuoteEnd.matcher(openedBraces).replaceAll("},");
            final String result = cleanBibTeX(BibTexUtils.addFieldIfNotContained(braced, BibTexUtils.ADDITIONAL_MISC_FIELD_KEYWORDS, keywords.toString()));
            if (result == null || result.trim().equals("")) {
                throw new ScrapingFailureException("getting bibtex failed");
            }
            scrapingContext.setBibtexResult(result);
            return true;
        } catch (IOException e) {
            // also covers MalformedURLException, which is an IOException
            throw new InternalFailureException(e);
        }
    }

    /**
     * Returns the first capturing group of the first match of {@code pattern}.
     *
     * @param pattern pattern with at least one capturing group
     * @param content text to search, may be {@code null}
     * @return the captured text, or {@code null} if {@code content} is
     *         {@code null} or contains no match
     */
    private static String firstGroup(Pattern pattern, String content) {
        if (content == null) {
            return null;
        }
        final Matcher matcher = pattern.matcher(content);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Cleans up a BibTeX entry: removes carriage returns, rewrites a page range
     * of the form {@code 12 - 34} to {@code 12--34} and passes the result through
     * {@link DOIUtils#cleanDOI(String)}.
     *
     * @param str the BibTeX to clean, may be {@code null}
     * @return the cleaned BibTeX, or {@code null} if {@code str} is {@code null}
     */
    protected String cleanBibTeX(String str) {
        if (str == null) {
            return null;
        }
        String bibtex = str.replace("\r", "");
        final Matcher pagesMatcher = patternBrokenPages.matcher(bibtex);
        if (pagesMatcher.matches()) {
            bibtex = pagesMatcher.replaceFirst("$1--$2");
        }
        return DOIUtils.cleanDOI(bibtex);
    }

    /** @return a short HTML description of this scraper linking to the supported site */
    @Override // org.bibsonomy.scraper.Scraper
    public String getInfo() {
        return info;
    }

    /** @return the single (host pattern, path pattern) pair this scraper is responsible for */
    @Override // org.bibsonomy.scraper.AbstractUrlScraper, org.bibsonomy.scraper.UrlScraper
    public List<Tuple<Pattern, Pattern>> getUrlPatterns() {
        return patterns;
    }

    /** @return the display name of the supported site */
    @Override // org.bibsonomy.scraper.UrlScraper
    public String getSupportedSiteName() {
        return SITE_NAME;
    }

    /** @return the home page URL of the supported site */
    @Override // org.bibsonomy.scraper.UrlScraper
    public String getSupportedSiteURL() {
        return SITE_URL;
    }
}
