package org.bibsonomy.scraper.url.kde.citeseer;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.didion.jwnl.dictionary.database.DatabaseManagerImpl;
import org.bibsonomy.model.util.BibTexUtils;
import org.bibsonomy.scraper.AbstractUrlScraper;
import org.bibsonomy.scraper.ScrapingContext;
import org.bibsonomy.scraper.Tuple;
import org.bibsonomy.scraper.exceptions.PageNotSupportedException;
import org.bibsonomy.scraper.exceptions.ScrapingException;
import org.bibsonomy.util.ValidationUtils;
import org.bibsonomy.util.WebUtils;
import org.springframework.beans.factory.xml.BeanDefinitionParserDelegate;

/* loaded from: input_file:WEB-INF/lib/bibsonomy-scraper-2.0.11.jar:org/bibsonomy/scraper/url/kde/citeseer/CiteseerxScraper.class */
/**
 * URL scraper for publication pages served by CiteSeerX.
 *
 * <p>The scraper pulls the BibTeX snippet embedded in the page HTML and, when the page
 * also shows an abstract, adds it to the entry. The page URL is added as the
 * {@code url} field unless the entry already has one.
 */
public class CiteseerxScraper extends AbstractUrlScraper {
    private static final String HOST = "citeseerx.ist.psu.edu";
    private static final String SITE_URL = "http://citeseerx.ist.psu.edu/";
    private static final String SITE_NAME = "CiteSeerX";
    private static final String INFO = "This scraper parses a publication page from the Scientific Literature Digital Library and Search Engine " + href(SITE_URL, SITE_NAME);

    /** Regex flags shared by the page patterns (numeric value 40, as in the original code). */
    private static final int PAGE_PATTERN_FLAGS = Pattern.MULTILINE | Pattern.DOTALL;

    /** BibTeX field names, kept local instead of borrowing constants from unrelated libraries. */
    private static final String ABSTRACT_FIELD = "abstract";
    private static final String URL_FIELD = "url";

    /** Captures the BibTeX entry (group 1) shown in the "BibTeX" box of the page. */
    private static final Pattern BIBTEX_PATTERN = Pattern.compile("<h2>BibTeX.*?</h2>\\s*<div class=\"content\">\\s*(@.*?)\\s*</div>", PAGE_PATTERN_FLAGS);

    /** Captures the abstract text (group 1) following the "Abstract:" label. */
    private static final Pattern ABSTRACT_PATTERN = Pattern.compile("Abstract:.*?<p class=\"para4\">(.*?)</p>", PAGE_PATTERN_FLAGS);

    /** Matches URLs where "summary" is directly followed by digits, i.e. the "?doi=" part is missing. */
    private static final Pattern BROKEN_URL_PATTERN = Pattern.compile(".*summary\\d+.*");

    /** Host pattern paired with the empty path pattern: every path on the host is accepted. */
    private static final List<Tuple<Pattern, Pattern>> URL_PATTERNS = Collections.singletonList(
            new Tuple<Pattern, Pattern>(Pattern.compile(".*" + HOST), AbstractUrlScraper.EMPTY_PATTERN));

    /**
     * @return a short HTML description of this scraper
     */
    @Override // org.bibsonomy.scraper.Scraper
    public String getInfo() {
        return INFO;
    }

    /**
     * Scrapes the BibTeX entry for the page referenced by the given context.
     *
     * <p>If the context carries selected text, that text is used verbatim as the BibTeX
     * result. Otherwise the entry is extracted from the page content.
     *
     * @param scrapingContext the context holding the URL, selected text and page content;
     *                        receives the scraper and the BibTeX result
     * @return {@code true} whenever a BibTeX result has been stored in the context
     * @throws PageNotSupportedException if the page contains no BibTeX snippet
     * @throws ScrapingException if the repaired URL cannot be built
     */
    @Override // org.bibsonomy.scraper.AbstractUrlScraper
    protected boolean scrapeInternal(ScrapingContext scrapingContext) throws ScrapingException {
        scrapingContext.setScraper(this);
        repairUrlIfUnreachable(scrapingContext);

        if (ValidationUtils.present(scrapingContext.getSelectedText())) {
            scrapingContext.setBibtexResult(scrapingContext.getSelectedText());
            return true;
        }

        final String pageContent = scrapingContext.getPageContent();
        final Matcher bibtexMatcher = BIBTEX_PATTERN.matcher(pageContent);
        if (!bibtexMatcher.find()) {
            throw new PageNotSupportedException("no bibtex snippet available");
        }

        // The snippet is HTML-formatted: turn line breaks and non-breaking spaces into plain text.
        String bibtex = bibtexMatcher.group(1).replace("<br/>", "\n").replace("&nbsp;", " ");

        final Matcher abstractMatcher = ABSTRACT_PATTERN.matcher(pageContent);
        if (abstractMatcher.find()) {
            bibtex = BibTexUtils.addFieldIfNotContained(bibtex, ABSTRACT_FIELD, abstractMatcher.group(1));
        }

        scrapingContext.setBibtexResult(
                BibTexUtils.addFieldIfNotContained(bibtex, URL_FIELD, scrapingContext.getUrl().toString()));
        return true;
    }

    /**
     * Probes the context URL and, if fetching it fails and the URL looks like a summary
     * link without its "?doi=" part, replaces the context URL with the repaired one.
     *
     * @param scrapingContext the context whose URL is probed and possibly replaced
     * @throws ScrapingException if the repaired URL is malformed
     */
    private void repairUrlIfUnreachable(ScrapingContext scrapingContext) throws ScrapingException {
        try {
            // Only used as a reachability probe; the fetched content is discarded.
            WebUtils.getContentAsString(scrapingContext.getUrl().toString());
        } catch (IOException probeFailure) {
            final String url = scrapingContext.getUrl().toString();
            if (!BROKEN_URL_PATTERN.matcher(url).matches()) {
                // Deliberately best effort: a failed probe of an unrecognised URL is ignored here.
                return;
            }
            try {
                scrapingContext.setUrl(new URL(url.replace("summary", "summary?doi=")));
            } catch (MalformedURLException malformed) {
                final ScrapingException failure = new ScrapingException("Couldn't build new URL");
                // Keep the root cause for diagnostics.
                failure.initCause(malformed);
                throw failure;
            }
        }
    }

    /**
     * @return the (host, path) pattern pairs this scraper is responsible for
     */
    @Override // org.bibsonomy.scraper.AbstractUrlScraper, org.bibsonomy.scraper.UrlScraper
    public List<Tuple<Pattern, Pattern>> getUrlPatterns() {
        return URL_PATTERNS;
    }

    /**
     * @return the display name of the supported site
     */
    @Override // org.bibsonomy.scraper.UrlScraper
    public String getSupportedSiteName() {
        return SITE_NAME;
    }

    /**
     * @return the base URL of the supported site
     */
    @Override // org.bibsonomy.scraper.UrlScraper
    public String getSupportedSiteURL() {
        return SITE_URL;
    }
}
