paperscraper.pdf

fallbacks

Functionalities to scrape PDF files of publications.

fallback_wiley_api(paper_metadata: Dict[str, Any], output_path: Path, api_keys: Dict[str, str], max_attempts: int = 2) -> bool

Attempt to download the PDF via the Wiley TDM API (Wiley is a popular publisher that blocks standard scraping attempts; API access is free for academic users).

This function uses the WILEY_TDM_API_TOKEN environment variable to authenticate with the Wiley TDM API and attempts to download the PDF for the given paper. See https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining for a description of how to obtain your WILEY_TDM_API_TOKEN.

Parameters:

    paper_metadata (dict): Dictionary containing paper metadata. Must include the 'doi' key.
    output_path (Path): A pathlib.Path object representing the path where the PDF will be saved.
    api_keys (dict): Preloaded API keys.
    max_attempts (int): The maximum number of attempts to retry the API call. Defaults to 2.

Returns:

    bool: True if the PDF file was successfully downloaded, False otherwise.

Source code in paperscraper/pdf/fallbacks.py
def fallback_wiley_api(
    paper_metadata: Dict[str, Any],
    output_path: Path,
    api_keys: Dict[str, str],
    max_attempts: int = 2,
) -> bool:
    """
    Attempt to download the PDF via the Wiley TDM API (Wiley is a popular publisher
    that blocks standard scraping attempts; API access is free for academic users).

    This function uses the WILEY_TDM_API_TOKEN environment variable to authenticate
    with the Wiley TDM API and attempts to download the PDF for the given paper.
    See https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining
    for a description of how to obtain your WILEY_TDM_API_TOKEN.

    Args:
        paper_metadata (dict): Dictionary containing paper metadata. Must include the 'doi' key.
        output_path (Path): A pathlib.Path object representing the path where the PDF will be saved.
        api_keys (dict): Preloaded API keys.
        max_attempts (int): The maximum number of attempts to retry the API call.

    Returns:
        bool: True if the PDF file was successfully downloaded, False otherwise.
    """

    WILEY_TDM_API_TOKEN = api_keys.get("WILEY_TDM_API_TOKEN")
    encoded_doi = paper_metadata["doi"].replace("/", "%2F")
    api_url = f"https://api.wiley.com/onlinelibrary/tdm/v1/articles/{encoded_doi}"
    headers = {"Wiley-TDM-Client-Token": WILEY_TDM_API_TOKEN}

    attempt = 0
    success = False

    while attempt < max_attempts:
        try:
            api_response = requests.get(
                api_url, headers=headers, allow_redirects=True, timeout=60
            )
            api_response.raise_for_status()
            if api_response.content[:4] != b"%PDF":
                logger.warning(
                    f"API returned content that is not a valid PDF for {paper_metadata['doi']}."
                )
            else:
                with open(output_path.with_suffix(".pdf"), "wb+") as f:
                    f.write(api_response.content)
                logger.info(
                    f"Successfully downloaded PDF via Wiley API for {paper_metadata['doi']}."
                )
                success = True
                break
        except Exception as e2:
            logger.error(
                f"Could not download via Wiley API (attempt {attempt + 1}/{max_attempts}): {e2}"
            )
            if attempt < max_attempts - 1:
                logger.info("Waiting 20 seconds before retrying...")
                time.sleep(20)

        attempt += 1

    # **Mandatory delay of 10 seconds to comply with Wiley API rate limits**
    logger.info(
        "Waiting 10 seconds before next request to comply with Wiley API rate limits..."
    )
    time.sleep(10)
    return success
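
Example usage, as a minimal sketch (the DOI is a placeholder; the import paths are assumed from the source file locations shown above, and a .env file with WILEY_TDM_API_TOKEN is expected):

from pathlib import Path

from paperscraper.pdf.fallbacks import fallback_wiley_api
from paperscraper.pdf.utils import load_api_keys

api_keys = load_api_keys()  # reads WILEY_TDM_API_TOKEN from a .env file, if present

paper = {"doi": "10.1002/XXXX.XXXXX"}  # placeholder Wiley DOI
# The downloads/ folder must already exist; the PDF is written with a .pdf suffix
if fallback_wiley_api(paper, Path("downloads/my_paper"), api_keys):
    print("PDF written to downloads/my_paper.pdf")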

fallback_bioc_pmc(doi: str, output_path: Path) -> bool

Attempt to download the XML via the BioC-PMC fallback.

This function first converts a given DOI to a PMCID using the NCBI ID Converter API. If a PMCID is found, it constructs the corresponding PMC XML URL and attempts to download the full-text XML.

PubMed Central® (PMC) is a free full-text archive of biomedical and life sciences journal literature at the U.S. National Institutes of Health's National Library of Medicine (NIH/NLM).

Parameters:

    doi (str): The DOI of the paper to retrieve.
    output_path (Path): A pathlib.Path object representing the path where the XML file will be saved.

Returns:

    bool: True if the XML file was successfully downloaded, False otherwise.

Source code in paperscraper/pdf/fallbacks.py
def fallback_bioc_pmc(doi: str, output_path: Path) -> bool:
    """
    Attempt to download the XML via the BioC-PMC fallback.

    This function first converts a given DOI to a PMCID using the NCBI ID Converter API.
    If a PMCID is found, it constructs the corresponding PMC XML URL and attempts to
    download the full-text XML.

    PubMed Central® (PMC) is a free full-text archive of biomedical and life sciences
    journal literature at the U.S. National Institutes of Health's National Library of Medicine (NIH/NLM).

    Args:
        doi (str): The DOI of the paper to retrieve.
        output_path (Path): A pathlib.Path object representing the path where the XML file will be saved.

    Returns:
        bool: True if the XML file was successfully downloaded, False otherwise.
    """
    ncbi_tool = "paperscraper"
    ncbi_email = "your_email@example.com"

    converter_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"
    params = {
        "tool": ncbi_tool,
        "email": ncbi_email,
        "ids": doi,
        "idtype": "doi",
        "format": "json",
    }
    try:
        conv_response = requests.get(converter_url, params=params, timeout=60)
        conv_response.raise_for_status()
        data = conv_response.json()
        records = data.get("records", [])
        if not records or "pmcid" not in records[0]:
            logger.warning(
                f"No PMCID available for DOI {doi}. Fallback via PMC therefore not possible."
            )
            return False
        pmcid = records[0]["pmcid"]
        logger.info(f"Converted DOI {doi} to PMCID {pmcid}.")
    except Exception as conv_err:
        logger.error(f"Error during DOI to PMCID conversion: {conv_err}")
        return False

    # Construct PMC XML URL
    xml_url = f"https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/{pmcid}/unicode"
    logger.info(f"Attempting to download XML from BioC-PMC URL: {xml_url}")
    try:
        xml_response = requests.get(xml_url, timeout=60)
        xml_response.raise_for_status()
        xml_path = output_path.with_suffix(".xml")
        # check for xml error:
        if xml_response.content.startswith(
            b"[Error] : No result can be found. <BR><HR><B> - https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/"
        ):
            logger.warning(f"No XML found for DOI {doi} at BioC-PMC URL {xml_url}.")
            return False
        with open(xml_path, "wb+") as f:
            f.write(xml_response.content)
        logger.info(f"Successfully downloaded XML for DOI {doi} to {xml_path}.")
        return True
    except Exception as xml_err:
        logger.error(f"Failed to download XML from BioC-PMC URL {xml_url}: {xml_err}")
        return False
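
A short usage sketch (the DOI is an arbitrary open-access example; the call succeeds only if PMC holds a full-text record for it, and the import path is assumed from the source file location above):

from pathlib import Path

from paperscraper.pdf.fallbacks import fallback_bioc_pmc

# DOI is converted to a PMCID first; the XML is written with a .xml suffix
if fallback_bioc_pmc("10.1038/s41586-020-2649-2", Path("downloads/paper")):
    print("XML saved to downloads/paper.xml")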

fallback_elsevier_api(paper_metadata: Dict[str, Any], output_path: Path, api_keys: Dict[str, str]) -> bool

Attempt to download the full text via the Elsevier TDM API. For more information, see: https://www.elsevier.com/about/policies-and-standards/text-and-data-mining (Requires an institutional subscription and an API key provided in the api_keys dictionary under the key "ELSEVIER_TDM_API_KEY".)

Parameters:

    paper_metadata (Dict[str, Any]): Dictionary containing paper metadata. Must include the 'doi' key.
    output_path (Path): A pathlib.Path object representing the path where the XML file will be saved.
    api_keys (Dict[str, str]): A dictionary containing API keys. Must include the key "ELSEVIER_TDM_API_KEY".

Returns:

    bool: True if the XML file was successfully downloaded, False otherwise.

Source code in paperscraper/pdf/fallbacks.py
def fallback_elsevier_api(
    paper_metadata: Dict[str, Any], output_path: Path, api_keys: Dict[str, str]
) -> bool:
    """
    Attempt to download the full text via the Elsevier TDM API.
    For more information, see:
    https://www.elsevier.com/about/policies-and-standards/text-and-data-mining
    (Requires an institutional subscription and an API key provided in the api_keys dictionary under the key "ELSEVIER_TDM_API_KEY".)

    Args:
        paper_metadata (Dict[str, Any]): Dictionary containing paper metadata. Must include the 'doi' key.
        output_path (Path): A pathlib.Path object representing the path where the XML file will be saved.
        api_keys (Dict[str, str]): A dictionary containing API keys. Must include the key "ELSEVIER_TDM_API_KEY".

    Returns:
        bool: True if the XML file was successfully downloaded, False otherwise.
    """
    elsevier_api_key = api_keys.get("ELSEVIER_TDM_API_KEY")
    doi = paper_metadata["doi"]
    api_url = f"https://api.elsevier.com/content/article/doi/{doi}?apiKey={elsevier_api_key}&httpAccept=text%2Fxml"
    logger.info(f"Attempting download via Elsevier API (XML) for {doi}: {api_url}")
    headers = {"Accept": "application/xml"}
    try:
        response = requests.get(api_url, headers=headers, timeout=60)

        # Check for 401 error and look for APIKEY_INVALID in the response
        if response.status_code == 401:
            error_text = response.text
            if "APIKEY_INVALID" in error_text:
                logger.error("Invalid API key. Couldn't download via Elsevier XML API")
            else:
                logger.error("401 Unauthorized. Couldn't download via Elsevier XML API")
            return False

        response.raise_for_status()

        # Attempt to parse it with lxml to confirm it's valid XML
        try:
            etree.fromstring(response.content)
        except etree.XMLSyntaxError as e:
            logger.warning(f"Elsevier API returned invalid XML for {doi}: {e}")
            return False

        xml_path = output_path.with_suffix(".xml")
        with open(xml_path, "wb") as f:
            f.write(response.content)
        logger.info(
            f"Successfully used the Elsevier API to download XML for {doi} to {xml_path}"
        )
        return True
    except Exception as e:
        logger.error(f"Could not download via Elsevier XML API: {e}")
        return False
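
A hedged usage sketch (both the key and the DOI below are placeholders; the import path is assumed from the source file location above):

from pathlib import Path

from paperscraper.pdf.fallbacks import fallback_elsevier_api

api_keys = {"ELSEVIER_TDM_API_KEY": "your_elsevier_key_here"}  # placeholder key
paper = {"doi": "10.1016/j.xxxx.2024.01.001"}  # placeholder Elsevier DOI

# On success the validated XML is written with a .xml suffix
if fallback_elsevier_api(paper, Path("downloads/paper"), api_keys):
    print("XML saved to downloads/paper.xml")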

fallback_elife_xml(doi: str, output_path: Path) -> bool

Attempt to download the XML via the eLife XML repository on GitHub.

eLife provides open access to their XML files on GitHub, which can be used as a fallback. When multiple versions exist (revised papers), it takes the latest version (e.g., v3 instead of v1).

Parameters:

    doi (str): The DOI of the eLife paper to download.
    output_path (Path): A pathlib.Path object representing the path where the XML file will be saved.

Returns:

    bool: True if the XML file was successfully downloaded, False otherwise.

Source code in paperscraper/pdf/fallbacks.py
def fallback_elife_xml(doi: str, output_path: Path) -> bool:
    """
    Attempt to download the XML via the eLife XML repository on GitHub.

    eLife provides open access to their XML files on GitHub, which can be used as a fallback.
    When multiple versions exist (revised papers), it takes the latest version (e.g., v3 instead of v1).

    Args:
        doi (str): The DOI of the eLife paper to download.
        output_path (Path): A pathlib.Path object representing the path where the XML file will be saved.

    Returns:
        bool: True if the XML file was successfully downloaded, False otherwise.
    """
    parts = doi.split("eLife.")
    if len(parts) < 2:
        logger.error(f"Unable to parse eLife DOI: {doi}")
        return False
    article_num = parts[1].strip()

    index = get_elife_xml_index()
    if article_num not in index:
        logger.warning(f"No eLife XML found for DOI {doi}.")
        return False
    candidate_files = index[article_num]
    latest_version, latest_download_url = max(candidate_files, key=lambda x: x[0])
    try:
        r = requests.get(latest_download_url, timeout=60)
        r.raise_for_status()
        latest_xml = r.content
    except Exception as e:
        logger.error(f"Error downloading file from {latest_download_url}: {e}")
        return False

    xml_path = output_path.with_suffix(".xml")
    with open(xml_path, "wb") as f:
        f.write(latest_xml)
    logger.info(
        f"Successfully downloaded XML via eLife API ({latest_version}) for DOI {doi} to {xml_path}."
    )
    return True
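
A minimal usage sketch (the article number in the DOI is a placeholder; the import path is assumed from the source file location above):

from pathlib import Path

from paperscraper.pdf.fallbacks import fallback_elife_xml

# eLife DOIs have the form 10.7554/eLife.<article number>
if fallback_elife_xml("10.7554/eLife.12345", Path("downloads/elife_paper")):
    print("Latest-version XML saved to downloads/elife_paper.xml")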

get_elife_xml_index() -> dict

Fetch the eLife XML index from GitHub and return it as a dictionary.

This function retrieves and caches the list of available eLife articles in XML format from the eLife GitHub repository. It ensures that the latest version of each article is accessible for downloading. The index is cached in memory to avoid repeated network requests when processing multiple eLife papers.

Returns:

    dict: A dictionary where keys are article numbers (as strings) and values are lists
    of tuples (version, download_url). Each list is sorted by version number.

Source code in paperscraper/pdf/fallbacks.py
def get_elife_xml_index() -> dict:
    """
    Fetch the eLife XML index from GitHub and return it as a dictionary.

    This function retrieves and caches the list of available eLife articles in XML format
    from the eLife GitHub repository. It ensures that the latest version of each article
    is accessible for downloading. The index is cached in memory to avoid repeated
    network requests when processing multiple eLife papers.

    Returns:
        dict: A dictionary where keys are article numbers (as strings) and values are
              lists of tuples (version, download_url). Each list is sorted by version number.
    """
    global ELIFE_XML_INDEX
    if ELIFE_XML_INDEX is None:
        logger.info("Fetching eLife XML index from GitHub using git tree API")
        ELIFE_XML_INDEX = {}
        # Use the git tree API to get the full repository tree.
        base_tree_url = "https://api.github.com/repos/elifesciences/elife-article-xml/git/trees/master?recursive=1"
        r = requests.get(base_tree_url, timeout=60)
        r.raise_for_status()
        tree_data = r.json()
        items = tree_data.get("tree", [])
        # Look for files in the 'articles' directory matching the pattern.
        pattern = r"articles/elife-(\d+)-v(\d+)\.xml"
        for item in items:
            path = item.get("path", "")
            match = re.match(pattern, path)
            if match:
                article_num_padded = match.group(1)
                version = int(match.group(2))
                # Construct the raw download URL.
                download_url = f"https://raw.githubusercontent.com/elifesciences/elife-article-xml/master/{path}"
                ELIFE_XML_INDEX.setdefault(article_num_padded, []).append(
                    (version, download_url)
                )
        # Sort each article's file list by version.
        for key in ELIFE_XML_INDEX:
            ELIFE_XML_INDEX[key].sort(key=lambda x: x[0])
    return ELIFE_XML_INDEX
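
To illustrate the shape of the cached index, a small inspection sketch (the article number is hypothetical):

from paperscraper.pdf.fallbacks import get_elife_xml_index

index = get_elife_xml_index()  # cached in memory after the first call

versions = index.get("12345", [])  # hypothetical article number
for version, url in versions:
    print(version, url)

if versions:
    latest_version, latest_url = versions[-1]  # lists are sorted, so the last entry is newest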

month_folder(doi: str) -> str

Query the bioRxiv API to get the posting date of a given DOI and convert it to the bioRxiv S3 folder name, rolling over to the next month if the date is the month's last day. E.g., a paper posted on the last day of April is treated as May_YYYY.

Parameters:

    doi (str): The DOI for which to retrieve the date.

Returns:

    str: Month and year in the format October_2019.

Source code in paperscraper/pdf/fallbacks.py
def month_folder(doi: str) -> str:
    """
    Query bioRxiv API to get the posting date of a given DOI.
    Convert a date to the BioRxiv S3 folder name, rolling over if it's the month's last day.
    E.g., if date is the last day of April, treat as May_YYYY.

    Args:
        doi: The DOI for which to retrieve the date.

    Returns:
        Month and year in format `October_2019`
    """
    url = f"https://api.biorxiv.org/details/biorxiv/{doi}/na/json"
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    date_str = resp.json()["collection"][0]["date"]
    date = datetime.date.fromisoformat(date_str)

    # NOTE: bioRxiv papers posted on the last day of the month are archived the next day
    last_day = calendar.monthrange(date.year, date.month)[1]
    if date.day == last_day:
        date = date + datetime.timedelta(days=1)
    return date.strftime("%B_%Y")
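
The rollover rule in isolation, as a self-contained sketch of the date arithmetic (to_s3_folder is an illustrative helper, not part of the library; no network call involved):

import calendar
import datetime

def to_s3_folder(date: datetime.date) -> str:
    # Papers posted on a month's last day are archived under the next month
    last_day = calendar.monthrange(date.year, date.month)[1]
    if date.day == last_day:
        date += datetime.timedelta(days=1)
    return date.strftime("%B_%Y")

print(to_s3_folder(datetime.date(2019, 4, 30)))   # May_2019 (rollover)
print(to_s3_folder(datetime.date(2019, 10, 15)))  # October_2019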

list_meca_keys(s3_client: BaseClient, bucket: str, prefix: str) -> list

List all .meca object keys under a given prefix in a requester-pays bucket.

Parameters:

    s3_client (BaseClient): S3 client to get the data from.
    bucket (str): Bucket to get data from.
    prefix (str): Prefix to get data from.

Returns:

    list: List of keys, one per existing .meca in the bucket.

Source code in paperscraper/pdf/fallbacks.py
def list_meca_keys(s3_client: BaseClient, bucket: str, prefix: str) -> list:
    """
    List all .meca object keys under a given prefix in a requester-pays bucket.

    Args:
        s3_client: S3 client to get the data from.
        bucket: bucket to get data from.
        prefix: prefix to get data from.

    Returns:
        List of keys, one per existing .meca in the bucket.
    """
    keys = []
    paginator = s3_client.get_paginator("list_objects_v2")
    for page in paginator.paginate(
        Bucket=bucket, Prefix=prefix, RequestPayer="requester"
    ):
        for obj in page.get("Contents", []):
            if obj["Key"].endswith(".meca"):
                keys.append(obj["Key"])
    return keys

find_meca_for_doi(s3_client: BaseClient, bucket: str, key: str, doi_token: str) -> bool

Efficiently inspect manifest.xml within a .meca zip by fetching only necessary bytes. Parse via ZipFile to read manifest.xml and match DOI token.

Parameters:

    s3_client (BaseClient): S3 client to get the data from.
    bucket (str): Bucket to get data from.
    key (str): Object key of the .meca archive to inspect.
    doi_token (str): The DOI token that should be matched.

Returns:

    bool: Whether or not the DOI could be matched.

Source code in paperscraper/pdf/fallbacks.py
def find_meca_for_doi(
    s3_client: BaseClient, bucket: str, key: str, doi_token: str
) -> bool:
    """
    Efficiently inspect manifest.xml within a .meca zip by fetching only necessary bytes.
    Parse via ZipFile to read manifest.xml and match DOI token.

    Args:
        s3_client: S3 client to get the data from.
        bucket: bucket to get data from.
        key: Object key of the .meca archive to inspect.
        doi_token: the DOI that should be matched

    Returns:
        Whether or not the DOI could be matched
    """
    try:
        head = s3_client.get_object(
            Bucket=bucket, Key=key, Range="bytes=0-4095", RequestPayer="requester"
        )["Body"].read()
        tail = s3_client.get_object(
            Bucket=bucket, Key=key, Range="bytes=-4096", RequestPayer="requester"
        )["Body"].read()
    except Exception:
        return False

    data = head + tail
    with zipfile.ZipFile(io.BytesIO(data)) as z:
        manifest = z.read("manifest.xml")

    # Extract the last part of the DOI (newer DOIs that contain date fail otherwise)
    doi_token = doi_token.split(".")[-1]
    return doi_token.encode("utf-8") in manifest.lower()

fallback_s3(doi: str, output_path: Union[str, Path], api_keys: dict, workers: int = 32) -> bool

Download a BioRxiv PDF via the requester-pays S3 bucket using range requests.

Parameters:

    doi (str): The DOI for which to retrieve the PDF (e.g. '10.1101/798496').
    output_path (Union[str, Path]): Path where the PDF will be saved (with .pdf suffix added).
    api_keys (dict): Dict containing 'AWS_ACCESS_KEY_ID' and 'AWS_SECRET_ACCESS_KEY'.
    workers (int): Number of threads used to scan the monthly archive for the DOI. Defaults to 32.

Returns:

    bool: True if download succeeded, False otherwise.

Source code in paperscraper/pdf/fallbacks.py
def fallback_s3(
    doi: str, output_path: Union[str, Path], api_keys: dict, workers: int = 32
) -> bool:
    """
    Download a BioRxiv PDF via the requester-pays S3 bucket using range requests.

    Args:
        doi: The DOI for which to retrieve the PDF (e.g. '10.1101/798496').
        output_path: Path where the PDF will be saved (with .pdf suffix added).
        api_keys: Dict containing 'AWS_ACCESS_KEY_ID' and 'AWS_SECRET_ACCESS_KEY'.
        workers: Number of threads used to scan the monthly archive for the DOI.

    Returns:
        True if download succeeded, False otherwise.
    """

    s3 = boto3.client(
        "s3",
        aws_access_key_id=api_keys.get("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=api_keys.get("AWS_SECRET_ACCESS_KEY"),
        region_name="us-east-1",
    )
    bucket = "biorxiv-src-monthly"

    # Derive prefix from DOI date
    prefix = f"Current_Content/{month_folder(doi)}/"

    # List MECA archives in that month
    meca_keys = list_meca_keys(s3, bucket, prefix)
    if not meca_keys:
        return False

    token = doi.split("/")[-1].lower()
    target = None
    executor = ThreadPoolExecutor(max_workers=workers)
    futures = {
        executor.submit(find_meca_for_doi, s3, bucket, key, token): key
        for key in meca_keys
    }
    pbar = tqdm(
        total=len(futures),
        desc=f"Scanning in biorxiv with {workers} workers for {doi}…",
    )
    for future in as_completed(futures):
        key = futures[future]
        try:
            if future.result():
                target = key
                pbar.set_description(f"Success! Found target {doi} in {key}")
                # cancel pending futures to speed shutdown
                for fut in futures:
                    fut.cancel()
                break
        except Exception:
            pass
        finally:
            pbar.update(1)
    # shutdown without waiting for remaining threads
    executor.shutdown(wait=False)
    if target is None:
        logger.error(f"Could not find {doi} on biorxiv")
        return False

    # Download full MECA and extract PDF
    data = s3.get_object(Bucket=bucket, Key=target, RequestPayer="requester")[
        "Body"
    ].read()
    output_path = Path(output_path)
    with zipfile.ZipFile(io.BytesIO(data)) as z:
        for name in z.namelist():
            if name.lower().endswith(".pdf"):
                z.extract(name, path=output_path.parent)
                # Move file to desired location
                (output_path.parent / name).rename(output_path.with_suffix(".pdf"))
                return True
    return False
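
A usage sketch (credentials are placeholders; requester-pays means your AWS account is billed for the transfer; the DOI is the one from the docstring example, and the import path is assumed from the source file location above):

from paperscraper.pdf.fallbacks import fallback_s3

api_keys = {
    "AWS_ACCESS_KEY_ID": "...",      # placeholder credentials
    "AWS_SECRET_ACCESS_KEY": "...",
}

# Scans the monthly MECA archives in parallel, then extracts the PDF
if fallback_s3("10.1101/798496", "downloads/biorxiv_paper", api_keys, workers=16):
    print("PDF saved to downloads/biorxiv_paper.pdf")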

pdf

Functionalities to scrape PDF files of publications.

save_pdf(paper_metadata: Dict[str, Any], filepath: Union[str, Path], save_metadata: bool = False, api_keys: Optional[Union[str, Dict[str, str]]] = None) -> None

Save a PDF file of a paper.

Parameters:

    paper_metadata (Dict[str, Any]): A dictionary with the paper metadata. Must contain the doi key.
    filepath (Union[str, Path]): Path to the PDF file to be saved (with or without suffix).
    save_metadata (bool): A boolean indicating whether to save paper metadata as a separate JSON. Defaults to False.
    api_keys (Optional[Union[str, Dict[str, str]]]): Either a dictionary containing API keys (if already
        loaded) or a string (path to an API keys file). If None, will try to load from a .env file and,
        if unsuccessful, skip API-based fallbacks. Defaults to None.
Source code in paperscraper/pdf/pdf.py
def save_pdf(
    paper_metadata: Dict[str, Any],
    filepath: Union[str, Path],
    save_metadata: bool = False,
    api_keys: Optional[Union[str, Dict[str, str]]] = None,
) -> None:
    """
    Save a PDF file of a paper.

    Args:
        paper_metadata: A dictionary with the paper metadata. Must contain the `doi` key.
        filepath: Path to the PDF file to be saved (with or without suffix).
        save_metadata: A boolean indicating whether to save paper metadata as a separate json.
        api_keys: Either a dictionary containing API keys (if already loaded) or a string (path to API keys file).
                  If None, will try to load from `.env` file and if unsuccessful, skip API-based fallbacks.
    """
    if not isinstance(paper_metadata, Dict):
        raise TypeError(f"paper_metadata must be a dict, not {type(paper_metadata)}.")
    if "doi" not in paper_metadata.keys():
        raise KeyError("paper_metadata must contain the key 'doi'.")
    if not isinstance(filepath, (str, Path)):
        raise TypeError(f"filepath must be a string or Path, not {type(filepath)}.")

    output_path = Path(filepath)

    if not output_path.parent.exists():
        raise ValueError(f"The folder {output_path.parent} does not exist.")

    # load API keys from file if not already loaded as a dict in save_pdf_from_dump
    if not isinstance(api_keys, dict):
        api_keys = load_api_keys(api_keys)

    doi = paper_metadata["doi"]
    url = f"https://doi.org/{doi}"
    success = False
    try:
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        success = True
    except Exception as e:
        error = str(e)
        logger.warning(f"Could not download from: {url} - {e}. ")

    if not success and "biorxiv" in error:
        if (
            api_keys.get("AWS_ACCESS_KEY_ID") is None
            or api_keys.get("AWS_SECRET_ACCESS_KEY") is None
        ):
            logger.info(
                "BiorXiv PDFs can be downloaded from a S3 bucket with a requester-pay option. "
                "Consider setting `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` to use this option. "
                "Pricing is a few cent per GB, thus each request costs < 0.1 cents. "
                "For details see: https://www.biorxiv.org/tdm"
            )
        else:
            success = FALLBACKS["s3"](doi, output_path, api_keys)
            if success:
                return

    if not success:
        # always first try fallback to BioC-PMC (open access papers on PubMed Central)
        success = FALLBACKS["bioc_pmc"](doi, output_path)

        # if BioC-PMC fails, try other fallbacks
        if not success:
            # check for specific publishers
            if "elife" in error.lower():  # elife has an open XML repository on GitHub
                FALLBACKS["elife"](doi, output_path)
            elif (
                ("wiley" in error.lower())
                and api_keys
                and ("WILEY_TDM_API_TOKEN" in api_keys)
            ):
                FALLBACKS["wiley"](paper_metadata, output_path, api_keys)
        return

    soup = BeautifulSoup(response.text, features="lxml")
    meta_pdf = soup.find("meta", {"name": "citation_pdf_url"})
    if meta_pdf and meta_pdf.get("content"):
        pdf_url = meta_pdf.get("content")
        try:
            response = requests.get(pdf_url, timeout=60)
            response.raise_for_status()

            if response.content[:4] != b"%PDF":
                logger.warning(
                    f"The file from {url} does not appear to be a valid PDF."
                )
                success = FALLBACKS["bioc_pmc"](doi, output_path)
                if not success:
                    # Check for specific publishers
                    if "elife" in doi.lower():
                        logger.info("Attempting fallback to eLife XML repository")
                        FALLBACKS["elife"](doi, output_path)
                    elif api_keys and "WILEY_TDM_API_TOKEN" in api_keys:
                        FALLBACKS["wiley"](paper_metadata, output_path, api_keys)
                    elif api_keys and "ELSEVIER_TDM_API_KEY" in api_keys:
                        FALLBACKS["elsevier"](paper_metadata, output_path, api_keys)
            else:
                with open(output_path.with_suffix(".pdf"), "wb+") as f:
                    f.write(response.content)
        except Exception as e:
            logger.warning(f"Could not download {pdf_url}: {e}")
    else:  # if no citation_pdf_url meta tag found, try other fallbacks
        if "elife" in doi.lower():
            logger.info(
                "DOI contains eLife, attempting fallback to eLife XML repository on GitHub."
            )
            if not FALLBACKS["elife"](doi, output_path):
                logger.warning(
                    f"eLife XML fallback failed for {paper_metadata['doi']}."
                )
        elif (
            api_keys and "ELSEVIER_TDM_API_KEY" in api_keys
        ):  # elsevier journals can be accessed via the Elsevier TDM API (requires API key)
            FALLBACKS["elsevier"](paper_metadata, output_path, api_keys)
        else:
            logger.warning(
                f"Retrieval failed. No citation_pdf_url meta tag found for {url} and no applicable fallback mechanism available."
            )

    if not save_metadata:
        return

    metadata = {}
    # Extract title
    title_tag = soup.find("meta", {"name": "citation_title"})
    metadata["title"] = title_tag.get("content") if title_tag else "Title not found"

    # Extract authors
    authors = []
    for author_tag in soup.find_all("meta", {"name": "citation_author"}):
        if author_tag.get("content"):
            authors.append(author_tag["content"])
    metadata["authors"] = authors if authors else ["Author information not found"]

    # Extract abstract
    domain = tldextract.extract(url).domain
    abstract_keys = ABSTRACT_ATTRIBUTE.get(domain, DEFAULT_ATTRIBUTES)

    for key in abstract_keys:
        abstract_tag = soup.find("meta", {"name": key})
        if abstract_tag:
            raw_abstract = BeautifulSoup(
                abstract_tag.get("content", "None"), "html.parser"
            ).get_text(separator="\n")
            if raw_abstract.strip().startswith("Abstract"):
                raw_abstract = raw_abstract.strip()[8:]
            metadata["abstract"] = raw_abstract.strip()
            break

    if "abstract" not in metadata.keys():
        metadata["abstract"] = "Abstract not found"
        logger.warning(f"Could not find abstract for {url}")
    elif metadata["abstract"].endswith("..."):
        logger.warning(f"Abstract truncated from {url}")

    # Save metadata to JSON
    try:
        with open(output_path.with_suffix(".json"), "w", encoding="utf-8") as f:
            json.dump(metadata, f, ensure_ascii=False, indent=4)
    except Exception as e:
        logger.error(f"Failed to save metadata to {str(output_path)}: {e}")

save_pdf_from_dump(dump_path: str, pdf_path: str, key_to_save: str = 'doi', save_metadata: bool = False, api_keys: Optional[str] = None) -> None

Receives a path to a .jsonl dump with paper metadata and saves the PDF file of each paper.

Parameters:

    dump_path (str): Path to a .jsonl file with paper metadata, one paper per line.
    pdf_path (str): Path to a folder where the files will be stored.
    key_to_save (str): Key in the paper metadata to use as filename. Has to be doi or title. Defaults to 'doi'.
    save_metadata (bool): A boolean indicating whether to save paper metadata as a separate JSON. Defaults to False.
    api_keys (Optional[str]): Path to a file with API keys. If None, API-based fallbacks will be skipped. Defaults to None.
Source code in paperscraper/pdf/pdf.py
def save_pdf_from_dump(
    dump_path: str,
    pdf_path: str,
    key_to_save: str = "doi",
    save_metadata: bool = False,
    api_keys: Optional[str] = None,
) -> None:
    """
    Receives a path to a `.jsonl` dump with paper metadata and saves the PDF files of
    each paper.

    Args:
        dump_path: Path to a `.jsonl` file with paper metadata, one paper per line.
        pdf_path: Path to a folder where the files will be stored.
        key_to_save: Key in the paper metadata to use as filename.
            Has to be `doi` or `title`. Defaults to `doi`.
        save_metadata: A boolean indicating whether to save paper metadata as a separate json.
        api_keys: Path to a file with API keys. If None, API-based fallbacks will be skipped.
    """

    if not isinstance(dump_path, str):
        raise TypeError(f"dump_path must be a string, not {type(dump_path)}.")
    if not dump_path.endswith(".jsonl"):
        raise ValueError("Please provide a dump_path with .jsonl extension.")

    if not isinstance(pdf_path, str):
        raise TypeError(f"pdf_path must be a string, not {type(pdf_path)}.")

    if not isinstance(key_to_save, str):
        raise TypeError(f"key_to_save must be a string, not {type(key_to_save)}.")
    if key_to_save not in ["doi", "title", "date"]:
        raise ValueError("key_to_save must be one of 'doi', 'title' or 'date'.")

    papers = load_jsonl(dump_path)

    if not isinstance(api_keys, dict):
        api_keys = load_api_keys(api_keys)

    pbar = tqdm(papers, total=len(papers), desc="Processing")
    for i, paper in enumerate(pbar):
        pbar.set_description(f"Processing paper {i + 1}/{len(papers)}")

        if "doi" not in paper.keys() or paper["doi"] is None:
            logger.warning(f"Skipping {paper['title']} since no DOI available.")
            continue
        filename = paper[key_to_save].replace("/", "_")
        pdf_file = Path(os.path.join(pdf_path, f"{filename}.pdf"))
        xml_file = pdf_file.with_suffix(".xml")
        if pdf_file.exists():
            logger.info(f"File {pdf_file} already exists. Skipping download.")
            continue
        if xml_file.exists():
            logger.info(f"File {xml_file} already exists. Skipping download.")
            continue
        output_path = str(pdf_file)
        save_pdf(paper, output_path, save_metadata=save_metadata, api_keys=api_keys)
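
A usage sketch (all paths are placeholders; the import path assumes save_pdf_from_dump is exported from paperscraper.pdf, as the source path above suggests):

from paperscraper.pdf import save_pdf_from_dump

# dumps/query_results.jsonl holds one metadata JSON object per line, each with a 'doi' key
save_pdf_from_dump(
    "dumps/query_results.jsonl",
    pdf_path="pdfs/",
    key_to_save="doi",
    save_metadata=True,
    api_keys="keys.env",  # optional path to a key file; omit to fall back to .env discovery
)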

utils

load_api_keys(filepath: Optional[str] = None) -> Dict[str, str]

Reads API keys from a file and returns them as a dictionary. The file should have each API key on a separate line in the format: KEY_NAME=API_KEY_VALUE

Example

    WILEY_TDM_API_TOKEN=your_wiley_token_here
    ELSEVIER_TDM_API_KEY=your_elsevier_key_here

Parameters:

    filepath (Optional[str]): Optional path to the file containing API keys. Defaults to None.

Returns:

    Dict[str, str]: A dictionary where keys are API key names and values are their respective API keys.

Source code in paperscraper/pdf/utils.py
def load_api_keys(filepath: Optional[str] = None) -> Dict[str, str]:
    """
    Reads API keys from a file and returns them as a dictionary.
    The file should have each API key on a separate line in the format:
        KEY_NAME=API_KEY_VALUE

    Example:
        WILEY_TDM_API_TOKEN=your_wiley_token_here
        ELSEVIER_TDM_API_KEY=your_elsevier_key_here

    Args:
        filepath: Optional path to the file containing API keys.

    Returns:
        Dict[str, str]: A dictionary where keys are API key names and values are their respective API keys.
    """
    if filepath:
        load_dotenv(dotenv_path=filepath)
    else:
        load_dotenv(find_dotenv())

    return {
        "WILEY_TDM_API_TOKEN": os.getenv("WILEY_TDM_API_TOKEN"),
        "ELSEVIER_TDM_API_KEY": os.getenv("ELSEVIER_TDM_API_KEY"),
        "AWS_ACCESS_KEY_ID": os.getenv("AWS_ACCESS_KEY_ID"),
        "AWS_SECRET_ACCESS_KEY": os.getenv("AWS_SECRET_ACCESS_KEY"),
    }
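
A short usage sketch (the key-file path is hypothetical; the import path is assumed from the source file location above):

from paperscraper.pdf.utils import load_api_keys

keys = load_api_keys("keys.env")  # omit the argument to search for a .env file instead
if keys.get("WILEY_TDM_API_TOKEN") is None:
    print("No Wiley token found; the Wiley fallback will be skipped")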