Skip to content

paperscraper.pubmed

paperscraper.pubmed

dump_papers(papers: pd.DataFrame, filepath: str) -> None

Receives a pd.DataFrame, one paper per row and dumps it into a .jsonl file with one paper per line.

Parameters:

Name Type Description Default
papers DataFrame

A dataframe of paper metadata, one paper per row.

required
filepath str

Path to dump the papers, has to end with .jsonl.

required
Source code in paperscraper/utils.py
def dump_papers(papers: pd.DataFrame, filepath: str) -> None:
    """
    Receives a pd.DataFrame, one paper per row and dumps it into a .jsonl
    file with one paper per line.

    Args:
        papers (pd.DataFrame): A dataframe of paper metadata, one paper per row.
            A list of dictionaries is also accepted (legacy, discouraged).
        filepath (str): Path to dump the papers, has to end with `.jsonl`.

    Raises:
        TypeError: If filepath is not a string, or papers is neither a
            pd.DataFrame nor a list of dictionaries.
        ValueError: If filepath does not end with `.jsonl`.
    """
    if not isinstance(filepath, str):
        raise TypeError(f"filepath must be a string, not {type(filepath)}")
    if not filepath.endswith(".jsonl"):
        raise ValueError("Please provide a filepath with .jsonl extension")

    # Legacy input format: convert a list of per-paper dicts on the fly.
    # (Plain `list`/`dict` in isinstance -- typing.List/Dict are deprecated there.)
    if isinstance(papers, list) and all(isinstance(p, dict) for p in papers):
        papers = pd.DataFrame(papers)
        logger.warning(
            "Preferably pass a pd.DataFrame, not a list of dictionaries. "
            "Passing a list is a legacy functionality that might become deprecated."
        )

    if not isinstance(papers, pd.DataFrame):
        raise TypeError(f"papers must be a pd.DataFrame, not {type(papers)}")

    # `orient="records"` yields one dict per row in row order and, unlike the
    # previous transpose-based conversion, does not silently collapse rows
    # that share a duplicate index.
    paper_list = papers.to_dict(orient="records")

    with open(filepath, "w", encoding="utf-8") as f:
        for paper in paper_list:
            f.write(json.dumps(paper) + "\n")

get_emails(paper: PubMedArticle) -> List

Extracts author email addresses from PubMedArticle.

Parameters:

Name Type Description Default
paper PubMedArticle

An object of type PubMedArticle. Requires to have an 'author' field.

required

Returns:

Name Type Description
List List

A possibly empty list of emails associated to authors of the paper.

Source code in paperscraper/pubmed/utils.py
def get_emails(paper: PubMedArticle) -> List:
    """
    Extracts author email addresses from PubMedArticle.

    Args:
        paper (PubMedArticle): An object of type PubMedArticle. Requires to have
            an 'author' field.

    Returns:
        List: A possibly empty list of emails associated to authors of the paper.
    """

    emails = []
    # NOTE(review): assumes paper.authors is an iterable of dicts whose values
    # are strings or None -- confirm against the PubMedArticle schema.
    for author in paper.authors:
        for v in author.values():
            # Any field value containing '@' is treated as potential email text.
            if v is not None and "@" in v:
                parts = v.split("@")
                if len(parts) == 2:
                    # Found one email address
                    # Local part: last space-separated token before the '@'.
                    prefix = parts[0].split(" ")[-1]
                    postfix = parts[1]
                    mail = prefix + "@" + postfix
                    if not (postfix.endswith(".") or postfix.endswith(" ")):
                        emails.append(mail)
                    else:
                        # Drop the trailing sentence period / stray space.
                        emails.append(mail[:-1])
                else:
                    # Found multiple addresses
                    # With several '@' signs, parts[0] holds the first local
                    # part and each later part holds "<domain> <next local part>".
                    for idx, part in enumerate(parts):
                        try:
                            if idx == 0:
                                prefix = part.split(" ")[-1]
                            else:
                                # Domain text ends at the first newline, if any.
                                postfix = part.split("\n")[0]

                                if postfix.endswith("."):
                                    # Last address in the enumeration: strip
                                    # the closing sentence period.
                                    postfix = postfix[:-1]
                                    mail = prefix + "@" + postfix
                                else:
                                    # Domain is the first token; the following
                                    # token seeds the next address's local part.
                                    current_postfix = postfix.split(" ")[0]
                                    mail = prefix + "@" + current_postfix
                                    prefix = postfix.split(" ")[1]
                                emails.append(mail)
                        except IndexError:
                            warnings.warn(f"Mail could not be inferred from {part}.")

    # De-duplicate; order of the result is unspecified.
    return list(set(emails))

get_query_from_keywords_and_date(keywords: List[Union[str, List]], start_date: str = 'None', end_date: str = 'None') -> str

Receives a list of keywords and returns the query for the pubmed API.

Parameters:

Name Type Description Default
keywords List[Union[str, List[str]]]

Items will be AND separated. If items are lists themselves, they will be OR separated.

required
start_date str

Start date for the search. Needs to be in format: YYYY/MM/DD, e.g. '2020/07/20'. Defaults to 'None', i.e. no specific dates are used.

'None'
end_date str

End date for the search. Same notation as start_date.

'None'
Note: If start_date and end_date are left as default, the function is identical to get_query_from_keywords.

Returns:

Name Type Description
str str

query to enter to pubmed API.

Source code in paperscraper/pubmed/utils.py
def get_query_from_keywords_and_date(
    keywords: List[Union[str, List]], start_date: str = "None", end_date: str = "None"
) -> str:
    """Receives a list of keywords and returns the query for the pubmed API.

    Args:
        keywords (List[Union[str, List[str]]]): Items will be AND separated. If
            items are lists themselves, they will be OR separated.
        start_date (str): Start date for the search. Needs to be in format:
            YYYY/MM/DD, e.g. '2020/07/20'. Defaults to 'None', i.e. no specific
            dates are used. `None` (the object) is accepted as well.
        end_date (str): End date for the search. Same notation as start_date.

    Note: If start_date and end_date are left as default, the function is
        identical to get_query_from_keywords.

    Returns:
        str: query to enter to pubmed API.
    """

    query = get_query_from_keywords(keywords)

    # Accept both the legacy string sentinel "None" and a real None.
    has_start = start_date not in (None, "None")
    has_end = end_date not in (None, "None")

    if not has_start and not has_end:
        return query

    # Open-ended ranges fall back to extreme sentinel years.
    date = date_root.format(
        start_date if has_start else "1000",
        end_date if has_end else "3000",
    )
    return query + " AND " + date

get_pubmed_papers(query: str, fields: List = ['title', 'authors', 'date', 'abstract', 'journal', 'doi'], max_results: int = 9998, *args, **kwargs) -> pd.DataFrame

Performs PubMed API request of a query and returns list of papers with fields as desired.

Parameters:

Name Type Description Default
query str

Query to PubMed API. Needs to match PubMed API notation.

required
fields List

List of strings with fields to keep in output. NOTE: If 'emails' is passed, an attempt is made to extract author mail addresses.

['title', 'authors', 'date', 'abstract', 'journal', 'doi']
max_results int

Maximal number of results retrieved from DB. Defaults to 9998, higher values likely raise problems due to PubMedAPI, see: https://stackoverflow.com/questions/75353091/biopython-entrez-article-limit

9998
args

additional arguments for pubmed.query

()
kwargs

additional arguments for pubmed.query

{}

Returns:

Type Description
DataFrame

pd.DataFrame. One paper per row.

Source code in paperscraper/pubmed/pubmed.py
def get_pubmed_papers(
    query: str,
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    max_results: int = 9998,
    *args,
    **kwargs,
) -> pd.DataFrame:
    """
    Performs PubMed API request of a query and returns list of papers with
    fields as desired.

    Args:
        query: Query to PubMed API. Needs to match PubMed API notation.
        fields: List of strings with fields to keep in output.
            NOTE: If 'emails' is passed, an attempt is made to extract author mail
            addresses. The list passed by the caller is never modified.
        max_results: Maximal number of results retrieved from DB. Defaults
            to 9998, higher values likely raise problems due to PubMedAPI, see:
            https://stackoverflow.com/questions/75353091/biopython-entrez-article-limit
        args: additional arguments for pubmed.query
        kwargs: additional arguments for pubmed.query

    Returns:
        pd.DataFrame: One paper per row.
    """
    if max_results > 9998:
        logger.warning(
            f"\nmax_results cannot be larger than 9998, received {max_results}. "
            "This will likely result in a JSONDecodeError. Consider lowering `max_results`.\n"
            "For PubMed, ESearch can only retrieve the first 9,999 records matching the query. "
            "To obtain more than 9,999 PubMed records, consider using EDirect that contains "
            "additional logic to batch PubMed search results automatically so that an "
            "arbitrary number can be retrieved."
        )
    raw = list(PUBMED.query(query, max_results=max_results, *args, **kwargs))

    # Work on a copy: popping 'emails' below must not mutate the caller's
    # list nor the shared mutable default argument.
    fields = list(fields)
    get_mails = "emails" in fields
    if get_mails:
        fields.remove("emails")

    # Rename keys via pubmed_field_mapper and post-process values via
    # process_fields (identity for unmapped fields); keep requested fields only.
    processed = [
        {
            pubmed_field_mapper.get(key, key): process_fields.get(
                pubmed_field_mapper.get(key, key), lambda x: x
            )(value)
            for key, value in paper.toDict().items()
            if pubmed_field_mapper.get(key, key) in fields
        }
        for paper in raw
    ]
    if get_mails:
        for idx, paper in enumerate(raw):
            processed[idx].update({"emails": get_emails(paper)})

    return pd.DataFrame(processed)

get_and_dump_pubmed_papers(keywords: List[Union[str, List[str]]], output_filepath: str, fields: List = ['title', 'authors', 'date', 'abstract', 'journal', 'doi'], start_date: str = 'None', end_date: str = 'None', *args, **kwargs) -> None

Combines get_pubmed_papers and dump_papers.

Parameters:

Name Type Description Default
keywords List[Union[str, List[str]]]

List of keywords to request pubmed API. The outer list level will be considered as AND separated keys. The inner level as OR separated.

required
output_filepath str

Path where the dump will be saved.

required
fields List

List of strings with fields to keep in output. Defaults to ['title', 'authors', 'date', 'abstract', 'journal', 'doi']. NOTE: If 'emails' is passed, an attempt is made to extract author mail addresses.

['title', 'authors', 'date', 'abstract', 'journal', 'doi']
start_date str

Start date for the search. Needs to be in format: YYYY/MM/DD, e.g. '2020/07/20'. Defaults to 'None', i.e. no specific dates are used.

'None'
end_date str

End date for the search. Same notation as start_date.

'None'
Source code in paperscraper/pubmed/pubmed.py
def get_and_dump_pubmed_papers(
    keywords: List[Union[str, List[str]]],
    output_filepath: str,
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    start_date: str = "None",
    end_date: str = "None",
    *args,
    **kwargs,
) -> None:
    """
    Combines get_pubmed_papers and dump_papers.

    Args:
        keywords: List of keywords to request pubmed API.
            The outer list level will be considered as AND separated keys.
            The inner level as OR separated.
        output_filepath: Path where the dump will be saved.
        fields: List of strings with fields to keep in output.
            Defaults to ['title', 'authors', 'date', 'abstract',
            'journal', 'doi'].
            NOTE: If 'emails' is passed, an attempt is made to extract author mail
            addresses.
        start_date: Start date for the search. Needs to be in format:
            YYYY/MM/DD, e.g. '2020/07/20'. Defaults to 'None', i.e. no specific
            dates are used.
        end_date: End date for the search. Same notation as start_date.
    """
    # Translate keywords into query.
    query = get_query_from_keywords_and_date(
        keywords, start_date=start_date, end_date=end_date
    )
    # Pass a copy: get_pubmed_papers pops 'emails' from the list it receives,
    # which would otherwise mutate the caller's list / the shared default.
    papers = get_pubmed_papers(query, list(fields), *args, **kwargs)
    dump_papers(papers, output_filepath)

pubmed

get_pubmed_papers(query: str, fields: List = ['title', 'authors', 'date', 'abstract', 'journal', 'doi'], max_results: int = 9998, *args, **kwargs) -> pd.DataFrame

Performs PubMed API request of a query and returns list of papers with fields as desired.

Parameters:

Name Type Description Default
query str

Query to PubMed API. Needs to match PubMed API notation.

required
fields List

List of strings with fields to keep in output. NOTE: If 'emails' is passed, an attempt is made to extract author mail addresses.

['title', 'authors', 'date', 'abstract', 'journal', 'doi']
max_results int

Maximal number of results retrieved from DB. Defaults to 9998, higher values likely raise problems due to PubMedAPI, see: https://stackoverflow.com/questions/75353091/biopython-entrez-article-limit

9998
args

additional arguments for pubmed.query

()
kwargs

additional arguments for pubmed.query

{}

Returns:

Type Description
DataFrame

pd.DataFrame. One paper per row.

Source code in paperscraper/pubmed/pubmed.py
def get_pubmed_papers(
    query: str,
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    max_results: int = 9998,
    *args,
    **kwargs,
) -> pd.DataFrame:
    """
    Performs PubMed API request of a query and returns list of papers with
    fields as desired.

    Args:
        query: Query to PubMed API. Needs to match PubMed API notation.
        fields: List of strings with fields to keep in output.
            NOTE: If 'emails' is passed, an attempt is made to extract author mail
            addresses. The list passed by the caller is never modified.
        max_results: Maximal number of results retrieved from DB. Defaults
            to 9998, higher values likely raise problems due to PubMedAPI, see:
            https://stackoverflow.com/questions/75353091/biopython-entrez-article-limit
        args: additional arguments for pubmed.query
        kwargs: additional arguments for pubmed.query

    Returns:
        pd.DataFrame: One paper per row.
    """
    if max_results > 9998:
        logger.warning(
            f"\nmax_results cannot be larger than 9998, received {max_results}. "
            "This will likely result in a JSONDecodeError. Consider lowering `max_results`.\n"
            "For PubMed, ESearch can only retrieve the first 9,999 records matching the query. "
            "To obtain more than 9,999 PubMed records, consider using EDirect that contains "
            "additional logic to batch PubMed search results automatically so that an "
            "arbitrary number can be retrieved."
        )
    raw = list(PUBMED.query(query, max_results=max_results, *args, **kwargs))

    # Work on a copy: popping 'emails' below must not mutate the caller's
    # list nor the shared mutable default argument.
    fields = list(fields)
    get_mails = "emails" in fields
    if get_mails:
        fields.remove("emails")

    # Rename keys via pubmed_field_mapper and post-process values via
    # process_fields (identity for unmapped fields); keep requested fields only.
    processed = [
        {
            pubmed_field_mapper.get(key, key): process_fields.get(
                pubmed_field_mapper.get(key, key), lambda x: x
            )(value)
            for key, value in paper.toDict().items()
            if pubmed_field_mapper.get(key, key) in fields
        }
        for paper in raw
    ]
    if get_mails:
        for idx, paper in enumerate(raw):
            processed[idx].update({"emails": get_emails(paper)})

    return pd.DataFrame(processed)

get_and_dump_pubmed_papers(keywords: List[Union[str, List[str]]], output_filepath: str, fields: List = ['title', 'authors', 'date', 'abstract', 'journal', 'doi'], start_date: str = 'None', end_date: str = 'None', *args, **kwargs) -> None

Combines get_pubmed_papers and dump_papers.

Parameters:

Name Type Description Default
keywords List[Union[str, List[str]]]

List of keywords to request pubmed API. The outer list level will be considered as AND separated keys. The inner level as OR separated.

required
output_filepath str

Path where the dump will be saved.

required
fields List

List of strings with fields to keep in output. Defaults to ['title', 'authors', 'date', 'abstract', 'journal', 'doi']. NOTE: If 'emails' is passed, an attempt is made to extract author mail addresses.

['title', 'authors', 'date', 'abstract', 'journal', 'doi']
start_date str

Start date for the search. Needs to be in format: YYYY/MM/DD, e.g. '2020/07/20'. Defaults to 'None', i.e. no specific dates are used.

'None'
end_date str

End date for the search. Same notation as start_date.

'None'
Source code in paperscraper/pubmed/pubmed.py
def get_and_dump_pubmed_papers(
    keywords: List[Union[str, List[str]]],
    output_filepath: str,
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    start_date: str = "None",
    end_date: str = "None",
    *args,
    **kwargs,
) -> None:
    """
    Combines get_pubmed_papers and dump_papers.

    Args:
        keywords: List of keywords to request pubmed API.
            The outer list level will be considered as AND separated keys.
            The inner level as OR separated.
        output_filepath: Path where the dump will be saved.
        fields: List of strings with fields to keep in output.
            Defaults to ['title', 'authors', 'date', 'abstract',
            'journal', 'doi'].
            NOTE: If 'emails' is passed, an attempt is made to extract author mail
            addresses.
        start_date: Start date for the search. Needs to be in format:
            YYYY/MM/DD, e.g. '2020/07/20'. Defaults to 'None', i.e. no specific
            dates are used.
        end_date: End date for the search. Same notation as start_date.
    """
    # Translate keywords into query.
    query = get_query_from_keywords_and_date(
        keywords, start_date=start_date, end_date=end_date
    )
    # Pass a copy: get_pubmed_papers pops 'emails' from the list it receives,
    # which would otherwise mutate the caller's list / the shared default.
    papers = get_pubmed_papers(query, list(fields), *args, **kwargs)
    dump_papers(papers, output_filepath)

utils

get_query_from_keywords(keywords: List[Union[str, List]]) -> str

Receives a list of keywords and returns the query for the pubmed API.

Parameters:

Name Type Description Default
keywords List[Union[str, List[str]]]

Items will be AND separated. If items are lists themselves, they will be OR separated.

required

Returns:

Name Type Description
str str

query to enter to pubmed API.

Source code in paperscraper/pubmed/utils.py
def get_query_from_keywords(keywords: List[Union[str, List]]) -> str:
    """Receives a list of keywords and returns the query for the pubmed API.

    Args:
        keywords (List[Union[str, List[str]]]): Items will be AND separated. If
            items are lists themselves, they will be OR separated.

    Returns:
        str: query to enter to pubmed API.
    """

    query = ""
    # Iterate keywords directly (the previous enumerate index was unused).
    for key in keywords:
        if isinstance(key, str):
            # Single keyword: parenthesize; AND-joining is finalized below.
            query += f"({key}) AND "
        elif isinstance(key, list):
            # Synonym group: OR-join, then close the disjunction.
            inter = "".join(f"({syn}) OR " for syn in key)
            query += finalize_disjunction(inter)

    return finalize_conjunction(query)

get_query_from_keywords_and_date(keywords: List[Union[str, List]], start_date: str = 'None', end_date: str = 'None') -> str

Receives a list of keywords and returns the query for the pubmed API.

Parameters:

Name Type Description Default
keywords List[Union[str, List[str]]]

Items will be AND separated. If items are lists themselves, they will be OR separated.

required
start_date str

Start date for the search. Needs to be in format: YYYY/MM/DD, e.g. '2020/07/20'. Defaults to 'None', i.e. no specific dates are used.

'None'
end_date str

End date for the search. Same notation as start_date.

'None'
Note: If start_date and end_date are left as default, the function is identical to get_query_from_keywords.

Returns:

Name Type Description
str str

query to enter to pubmed API.

Source code in paperscraper/pubmed/utils.py
def get_query_from_keywords_and_date(
    keywords: List[Union[str, List]], start_date: str = "None", end_date: str = "None"
) -> str:
    """Receives a list of keywords and returns the query for the pubmed API.

    Args:
        keywords (List[Union[str, List[str]]]): Items will be AND separated. If
            items are lists themselves, they will be OR separated.
        start_date (str): Start date for the search. Needs to be in format:
            YYYY/MM/DD, e.g. '2020/07/20'. Defaults to 'None', i.e. no specific
            dates are used. `None` (the object) is accepted as well.
        end_date (str): End date for the search. Same notation as start_date.

    Note: If start_date and end_date are left as default, the function is
        identical to get_query_from_keywords.

    Returns:
        str: query to enter to pubmed API.
    """

    query = get_query_from_keywords(keywords)

    # Accept both the legacy string sentinel "None" and a real None.
    has_start = start_date not in (None, "None")
    has_end = end_date not in (None, "None")

    if not has_start and not has_end:
        return query

    # Open-ended ranges fall back to extreme sentinel years.
    date = date_root.format(
        start_date if has_start else "1000",
        end_date if has_end else "3000",
    )
    return query + " AND " + date

get_emails(paper: PubMedArticle) -> List

Extracts author email addresses from PubMedArticle.

Parameters:

Name Type Description Default
paper PubMedArticle

An object of type PubMedArticle. Requires to have an 'author' field.

required

Returns:

Name Type Description
List List

A possibly empty list of emails associated to authors of the paper.

Source code in paperscraper/pubmed/utils.py
def get_emails(paper: PubMedArticle) -> List:
    """
    Extracts author email addresses from PubMedArticle.

    Args:
        paper (PubMedArticle): An object of type PubMedArticle. Requires to have
            an 'author' field.

    Returns:
        List: A possibly empty list of emails associated to authors of the paper.
    """

    emails = []
    # NOTE(review): assumes paper.authors is an iterable of dicts whose values
    # are strings or None -- confirm against the PubMedArticle schema.
    for author in paper.authors:
        for v in author.values():
            # Any field value containing '@' is treated as potential email text.
            if v is not None and "@" in v:
                parts = v.split("@")
                if len(parts) == 2:
                    # Found one email address
                    # Local part: last space-separated token before the '@'.
                    prefix = parts[0].split(" ")[-1]
                    postfix = parts[1]
                    mail = prefix + "@" + postfix
                    if not (postfix.endswith(".") or postfix.endswith(" ")):
                        emails.append(mail)
                    else:
                        # Drop the trailing sentence period / stray space.
                        emails.append(mail[:-1])
                else:
                    # Found multiple addresses
                    # With several '@' signs, parts[0] holds the first local
                    # part and each later part holds "<domain> <next local part>".
                    for idx, part in enumerate(parts):
                        try:
                            if idx == 0:
                                prefix = part.split(" ")[-1]
                            else:
                                # Domain text ends at the first newline, if any.
                                postfix = part.split("\n")[0]

                                if postfix.endswith("."):
                                    # Last address in the enumeration: strip
                                    # the closing sentence period.
                                    postfix = postfix[:-1]
                                    mail = prefix + "@" + postfix
                                else:
                                    # Domain is the first token; the following
                                    # token seeds the next address's local part.
                                    current_postfix = postfix.split(" ")[0]
                                    mail = prefix + "@" + current_postfix
                                    prefix = postfix.split(" ")[1]
                                emails.append(mail)
                        except IndexError:
                            warnings.warn(f"Mail could not be inferred from {part}.")

    # De-duplicate; order of the result is unspecified.
    return list(set(emails))