package org.bibsonomy.scraper.generic;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collection;
import java.util.Collections;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.bibsonomy.scraper.Scraper;
import org.bibsonomy.scraper.ScrapingContext;
import org.bibsonomy.scraper.exceptions.InternalFailureException;
import org.bibsonomy.scraper.exceptions.ScrapingException;
import org.bibsonomy.scraper.exceptions.ScrapingFailureException;
import org.bibsonomy.util.WebUtils;

/* loaded from: input_file:WEB-INF/lib/bibsonomy-scraper-2.0.32.jar:org/bibsonomy/scraper/generic/HighwireScraper.class */
/**
 * Scraper for publication pages that contain a Highwire "citation manager"
 * link ({@code /cgi/citmgr?gca=...}).
 *
 * <p>The link is looked up in the page content of the scraping context,
 * rewritten to request the BibTeX export ({@code type=bibtex}) and downloaded
 * from the same server that delivered the page. The citation keys of the
 * downloaded entries are then cleaned of whitespace before the result is
 * stored in the context.
 */
public class HighwireScraper implements Scraper {
    private static final String SITE_NAME = "Highwire Scraper Collection";
    private static final String SITE_URL = "http://highwire.stanford.edu/lists/allsites.dtl";
    private static final String INFO = "This scraper parses a publication page from one of these <a href=\"http://highwire.stanford.edu/lists/allsites.dtl\">journals hosted by Highwire Press</a>  and extracts the adequate BibTeX entry.";

    /** Relative citation-manager link as it appears in the page content. */
    private static final Pattern CITATION_LINK_PATTERN = Pattern.compile("/cgi/citmgr\\?gca=[\\w+;/&=.-]+");

    /**
     * Header of a BibTeX entry: {@code @type{...,}. The {@code .+} is greedy and
     * does not cross line terminators, so the match runs up to the last comma
     * on the line the entry starts on.
     */
    private static final Pattern ENTRY_HEADER_PATTERN = Pattern.compile("@\\w+\\{.+,");

    /** Single whitespace character; each occurrence in an entry header becomes "_". */
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s");

    /**
     * Tries to extract a BibTeX record for the page described by the context.
     *
     * @param scrapingContext context providing the page URL and page content;
     *                        receives the scraper and the BibTeX result on success
     * @return {@code true} if BibTeX was downloaded and stored in the context;
     *         {@code false} if the context has no URL or page content, the page
     *         contains no citation-manager link, or a {@link ScrapingException}
     *         occurred while scraping
     * @throws ScrapingException declared by the {@link Scraper} contract
     */
    @Override
    public boolean scrape(ScrapingContext scrapingContext) throws ScrapingException {
        if (scrapingContext == null || scrapingContext.getUrl() == null) {
            return false;
        }
        try {
            String pageContent = scrapingContext.getPageContent();
            if (pageContent == null) {
                return false;
            }
            Matcher linkMatcher = CITATION_LINK_PATTERN.matcher(pageContent);
            if (!linkMatcher.find()) {
                return false;
            }
            scrapingContext.setScraper(this);

            // Turn ".../citmgr?gca=..." into ".../citmgr?type=bibtex&gca=...".
            String exportPath = linkMatcher.group(0).replaceFirst("\\?", "?type=bibtex&");

            String bibtex;
            try {
                // Resolve against the page URL so that protocol, host and port
                // of the originating page are kept (the path is absolute).
                bibtex = WebUtils.getContentAsString(new URL(scrapingContext.getUrl(), exportPath));
            } catch (IOException e) {
                throw new InternalFailureException(e);
            }

            // Validate before the content is used for matching.
            if (bibtex == null || bibtex.isEmpty()) {
                throw new ScrapingFailureException("getting bibtex failed");
            }
            scrapingContext.setBibtexResult(normalizeEntryKeys(bibtex));
            return true;
        } catch (ScrapingException e) {
            // NOTE(review): every ScrapingException raised above (including the
            // failures thrown in this method, if they are ScrapingException
            // subtypes) is reported as "not scraped". This keeps the existing
            // behavior; confirm whether callers would rather see the exception.
            return false;
        }
    }

    /**
     * Replaces every whitespace character inside each BibTeX entry header with
     * an underscore, leaving the rest of the text untouched.
     *
     * @param bibtex non-null BibTeX text
     * @return the text with whitespace-free entry headers
     */
    private static String normalizeEntryKeys(String bibtex) {
        Matcher headerMatcher = ENTRY_HEADER_PATTERN.matcher(bibtex);
        StringBuffer normalized = new StringBuffer(bibtex.length());
        while (headerMatcher.find()) {
            String cleanedHeader = WHITESPACE_PATTERN.matcher(headerMatcher.group(0)).replaceAll("_");
            // quoteReplacement: the header is data, so '$' and '\' must not be
            // interpreted as group references or escapes.
            headerMatcher.appendReplacement(normalized, Matcher.quoteReplacement(cleanedHeader));
        }
        headerMatcher.appendTail(normalized);
        return normalized.toString();
    }

    /**
     * @return a collection containing only this scraper
     */
    @Override
    public Collection<Scraper> getScraper() {
        return Collections.singletonList(this);
    }

    /**
     * Checks whether the page content of the context contains a
     * citation-manager link this scraper can follow.
     *
     * @param scrapingContext context providing the page URL and page content
     * @return {@code true} if a link was found; {@code false} if the context has
     *         no URL or page content, no link is present, or reading the page
     *         content raised a {@link ScrapingException}
     */
    @Override
    public boolean supportsScrapingContext(ScrapingContext scrapingContext) {
        if (scrapingContext == null || scrapingContext.getUrl() == null) {
            return false;
        }
        try {
            String pageContent = scrapingContext.getPageContent();
            return pageContent != null && CITATION_LINK_PATTERN.matcher(pageContent).find();
        } catch (ScrapingException e) {
            return false;
        }
    }

    /**
     * Builds a context pointing at a sample page for testing.
     *
     * @return a new context whose URL is set to the sample page
     */
    public static ScrapingContext getTestContext() {
        ScrapingContext scrapingContext = new ScrapingContext(null);
        try {
            scrapingContext.setUrl(new URL("http://mend.endojournals.org/cgi/gca?sendit=Get+All+Checked+Abstract(s)&gca=17%2F1%2F1"));
        } catch (MalformedURLException ignored) {
            // The URL is a well-formed constant, so this is not expected to
            // happen; the context is returned as constructed in that case.
        }
        return scrapingContext;
    }

    /**
     * @return the HTML description of this scraper
     */
    @Override
    public String getInfo() {
        return INFO;
    }

    /**
     * @return the display name of the supported site collection
     */
    public String getSupportedSiteName() {
        return SITE_NAME;
    }

    /**
     * @return the URL of the list of supported sites
     */
    public String getSupportedSiteURL() {
        return SITE_URL;
    }
}
