Skip to content

paperscraper.scholar

paperscraper.scholar

get_citations_from_title(title: str) -> int

Parameters:

Name Type Description Default
title str

Title of paper to be searched on Scholar.

required

Raises:

Type Description
TypeError

If something other than a str is passed.

Returns:

Name Type Description
int int

Number of citations of paper.

Source code in paperscraper/citations/citations.py
def get_citations_from_title(title: str) -> int:
    """
    Looks up a paper on Scholar by its exact title and returns its citation count.

    Args:
        title (str): Title of paper to be searched on Scholar.

    Raises:
        TypeError: If something other than a str is passed.

    Returns:
        int: Number of citations of the first match, or 0 if nothing matched.
    """
    if not isinstance(title, str):
        raise TypeError(f"Pass str not {type(title)}")

    # Wrapping the stripped title in double quotes requests an exact match.
    title = f'"{title.strip()}"'

    # Every result is consumed so that the number of matches can be reported.
    citation_counts = [
        int(hit["num_citations"]) for hit in scholarly.search_pubs(title)
    ]

    if not citation_counts:
        logger.warning(f"Found no match for {title}.")
        return 0

    if len(citation_counts) > 1:
        logger.warning(
            f"Found {len(citation_counts)} matches for {title}, returning first one."
        )
    return citation_counts[0]

dump_papers(papers: pd.DataFrame, filepath: str) -> None

Receives a pd.DataFrame, one paper per row and dumps it into a .jsonl file with one paper per line.

Parameters:

Name Type Description Default
papers DataFrame

A dataframe of paper metadata, one paper per row.

required
filepath str

Path to dump the papers, has to end with .jsonl.

required
Source code in paperscraper/utils.py
def dump_papers(papers: pd.DataFrame, filepath: str) -> None:
    """
    Receives a pd.DataFrame, one paper per row and dumps it into a .jsonl
    file with one paper per line.

    Args:
        papers (pd.DataFrame): A dataframe of paper metadata, one paper per row.
            A list of dictionaries is still accepted for legacy callers; it is
            converted to a dataframe and a warning is logged.
        filepath (str): Path to dump the papers, has to end with `.jsonl`.

    Raises:
        TypeError: If `filepath` is not a str, or if `papers` is neither a
            pd.DataFrame nor a list of dictionaries.
        ValueError: If `filepath` does not end with `.jsonl`.
    """
    if not isinstance(filepath, str):
        raise TypeError(f"filepath must be a string, not {type(filepath)}")
    if not filepath.endswith(".jsonl"):
        raise ValueError("Please provide a filepath with .jsonl extension")

    if isinstance(papers, List) and all(isinstance(p, Dict) for p in papers):
        papers = pd.DataFrame(papers)
        logger.warning(
            "Preferably pass a pd.DataFrame, not a list of dictionaries. "
            "Passing a list is a legacy functionality that might become deprecated."
        )

    if not isinstance(papers, pd.DataFrame):
        raise TypeError(f"papers must be a pd.DataFrame, not {type(papers)}")

    # Build one dict per row. The previous `papers.T.to_dict()` keyed rows by
    # their index label, so rows sharing a label (e.g. after `pd.concat`)
    # overwrote each other and were silently dropped from the dump.
    paper_list = papers.to_dict(orient="records")

    with open(filepath, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(paper) + "\n" for paper in paper_list)

get_scholar_papers(title: str, fields: List = ['title', 'authors', 'year', 'abstract', 'journal', 'citations'], *args, **kwargs) -> pd.DataFrame

Performs Google Scholar API request of a given title and returns list of papers with fields as desired.

Parameters:

Name Type Description Default
title str

Query for Google Scholar. Should be a single string, as it would be entered in the Scholar search field.

required
fields List

List of strings with fields to keep in output.

['title', 'authors', 'year', 'abstract', 'journal', 'citations']

Returns:

Type Description
DataFrame

pd.DataFrame. One paper per row.

Source code in paperscraper/scholar/scholar.py
def get_scholar_papers(
    title: str,
    fields: List = ["title", "authors", "year", "abstract", "journal", "citations"],
    *args,
    **kwargs,
) -> pd.DataFrame:
    """
    Performs a Google Scholar search for a given title and returns the matching
    papers with fields as desired.

    Args:
        title: Query for Google Scholar. Should be a single string, as it would
            be entered in the Scholar search field (no Boolean keyword logic).
        fields: List of strings with fields to keep in output.
        *args: Not used by this function.
        **kwargs: Not used by this function.

    Raises:
        TypeError: If something other than a str is passed as `title`.

    Returns:
        pd.DataFrame. One paper per row.

    """
    logger.info(
        "NOTE: Scholar API cannot be used with Boolean logic in keywords. "
        "Query should be a single string to be entered in the Scholar search field."
    )
    if not isinstance(title, str):
        raise TypeError(f"Pass str not {type(title)}")

    matches = scholarly.search_pubs(title)

    # The citation count is stored outside of `bib`, so it needs its own check
    # against `fields`; previously it was added even when not requested.
    keep_citations = "citations" in fields

    processed = []
    for paper in matches:
        # Extracts title, author, year, journal, abstract
        entry = {}
        for key, value in paper["bib"].items():
            # Translate the Scholar key to the output field name (unknown keys
            # keep their own name) and skip fields that were not requested.
            field = scholar_field_mapper.get(key, key)
            if field not in fields:
                continue
            # Fields without a registered processor are passed through as-is.
            entry[field] = process_fields.get(field, lambda x: x)(value)

        if keep_citations:
            entry["citations"] = paper["num_citations"]
        processed.append(entry)

    return pd.DataFrame(processed)

get_and_dump_scholar_papers(title: str, output_filepath: str, fields: List = ['title', 'authors', 'year', 'abstract', 'journal', 'citations']) -> None

Combines get_scholar_papers and dump_papers.

Parameters:

Name Type Description Default
title str

Paper to search for on Google Scholar.

required
output_filepath str

Path where the dump will be saved.

required
fields List

List of strings with fields to keep in output.

['title', 'authors', 'year', 'abstract', 'journal', 'citations']
Source code in paperscraper/scholar/scholar.py
def get_and_dump_scholar_papers(
    title: str,
    output_filepath: str,
    fields: List = ["title", "authors", "year", "abstract", "journal", "citations"],
) -> None:
    """
    Searches Google Scholar via get_scholar_papers and writes the result to
    disk via dump_papers.

    Args:
        title: Paper to search for on Google Scholar.
        output_filepath: Path where the dump will be saved.
        fields: List of strings with fields to keep in output.
    """
    # Fetch first, then hand the resulting dataframe straight to the writer.
    dump_papers(get_scholar_papers(title, fields), output_filepath)

scholar

get_scholar_papers(title: str, fields: List = ['title', 'authors', 'year', 'abstract', 'journal', 'citations'], *args, **kwargs) -> pd.DataFrame

Performs Google Scholar API request of a given title and returns list of papers with fields as desired.

Parameters:

Name Type Description Default
title str

Query for Google Scholar. Should be a single string, as it would be entered in the Scholar search field.

required
fields List

List of strings with fields to keep in output.

['title', 'authors', 'year', 'abstract', 'journal', 'citations']

Returns:

Type Description
DataFrame

pd.DataFrame. One paper per row.

Source code in paperscraper/scholar/scholar.py
def get_scholar_papers(
    title: str,
    fields: List = ["title", "authors", "year", "abstract", "journal", "citations"],
    *args,
    **kwargs,
) -> pd.DataFrame:
    """
    Performs a Google Scholar search for a given title and returns the matching
    papers with fields as desired.

    Args:
        title: Query for Google Scholar. Should be a single string, as it would
            be entered in the Scholar search field (no Boolean keyword logic).
        fields: List of strings with fields to keep in output.
        *args: Not used by this function.
        **kwargs: Not used by this function.

    Raises:
        TypeError: If something other than a str is passed as `title`.

    Returns:
        pd.DataFrame. One paper per row.

    """
    logger.info(
        "NOTE: Scholar API cannot be used with Boolean logic in keywords. "
        "Query should be a single string to be entered in the Scholar search field."
    )
    if not isinstance(title, str):
        raise TypeError(f"Pass str not {type(title)}")

    matches = scholarly.search_pubs(title)

    # The citation count is stored outside of `bib`, so it needs its own check
    # against `fields`; previously it was added even when not requested.
    keep_citations = "citations" in fields

    processed = []
    for paper in matches:
        # Extracts title, author, year, journal, abstract
        entry = {}
        for key, value in paper["bib"].items():
            # Translate the Scholar key to the output field name (unknown keys
            # keep their own name) and skip fields that were not requested.
            field = scholar_field_mapper.get(key, key)
            if field not in fields:
                continue
            # Fields without a registered processor are passed through as-is.
            entry[field] = process_fields.get(field, lambda x: x)(value)

        if keep_citations:
            entry["citations"] = paper["num_citations"]
        processed.append(entry)

    return pd.DataFrame(processed)

get_and_dump_scholar_papers(title: str, output_filepath: str, fields: List = ['title', 'authors', 'year', 'abstract', 'journal', 'citations']) -> None

Combines get_scholar_papers and dump_papers.

Parameters:

Name Type Description Default
title str

Paper to search for on Google Scholar.

required
output_filepath str

Path where the dump will be saved.

required
fields List

List of strings with fields to keep in output.

['title', 'authors', 'year', 'abstract', 'journal', 'citations']
Source code in paperscraper/scholar/scholar.py
def get_and_dump_scholar_papers(
    title: str,
    output_filepath: str,
    fields: List = ["title", "authors", "year", "abstract", "journal", "citations"],
) -> None:
    """
    Searches Google Scholar via get_scholar_papers and writes the result to
    disk via dump_papers.

    Args:
        title: Paper to search for on Google Scholar.
        output_filepath: Path where the dump will be saved.
        fields: List of strings with fields to keep in output.
    """
    # Fetch first, then hand the resulting dataframe straight to the writer.
    dump_papers(get_scholar_papers(title, fields), output_filepath)