paperscraper.scholar

`paperscraper.scholar` ¶

`get_citations_from_title(title: str) -> int` ¶

Parameters:

Name	Type	Description	Default
`title`	`str`	Title of paper to be searched on Scholar.	required

Raises:

Type	Description
`TypeError`	If something other than a str is passed.

Returns:

Name	Type	Description
`int`	`int`	Number of citations of paper.

Source code in paperscraper/citations/citations.py

def get_citations_from_title(title: str) -> int:
    """
    Searches Scholar for a paper title and returns the citation count of the
    first match.

    Args:
        title (str): Title of paper to be searched on Scholar.

    Raises:
        TypeError: If something other than a str is passed.

    Returns:
        int: Number of citations of paper; 0 if the search yields no match.
    """
    if not isinstance(title, str):
        raise TypeError(f"Pass str not {type(title)}")

    # Quote the stripped title so that the search asks for an exact match
    title = f'"{title.strip()}"'

    counts = [int(pub["num_citations"]) for pub in scholarly.search_pubs(title)]
    if not counts:
        logger.warning(f"Found no match for {title}.")
        return 0
    if len(counts) > 1:
        logger.warning(f"Found {len(counts)} matches for {title}, returning first one.")
    return counts[0]

`dump_papers(papers: pd.DataFrame, filepath: str) -> None` ¶

Receives a pd.DataFrame, one paper per row and dumps it into a .jsonl file with one paper per line.

Parameters:

Name	Type	Description	Default
`papers`	`DataFrame`	A dataframe of paper metadata, one paper per row.	required
`filepath`	`str`	Path to dump the papers, has to end with `.jsonl`.	required

Source code in paperscraper/utils.py

def dump_papers(papers: pd.DataFrame, filepath: str) -> None:
    """
    Receives a pd.DataFrame, one paper per row and dumps it into a .jsonl
    file with one paper per line.

    Args:
        papers (pd.DataFrame): A dataframe of paper metadata, one paper per row.
            A list of dictionaries is still accepted (legacy) and is converted
            into a dataframe first.
        filepath (str): Path to dump the papers, has to end with `.jsonl`.

    Raises:
        TypeError: If `filepath` is not a str, or if `papers` is neither a
            pd.DataFrame nor a list of dictionaries.
        ValueError: If `filepath` does not end with `.jsonl`.
    """
    if not isinstance(filepath, str):
        raise TypeError(f"filepath must be a string, not {type(filepath)}")
    if not filepath.endswith(".jsonl"):
        raise ValueError("Please provide a filepath with .jsonl extension")

    if isinstance(papers, List) and all(isinstance(p, Dict) for p in papers):
        papers = pd.DataFrame(papers)
        logger.warning(
            "Preferably pass a pd.DataFrame, not a list of dictionaries. "
            "Passing a list is a legacy functionality that might become deprecated."
        )

    if not isinstance(papers, pd.DataFrame):
        raise TypeError(f"papers must be a pd.DataFrame, not {type(papers)}")

    # Export row by row instead of building a dict keyed by index label:
    # rows that share an index label (e.g. after concatenating frames) would
    # otherwise overwrite each other and silently vanish from the dump.
    records = papers.to_dict(orient="records")

    # json.dumps escapes non-ASCII by default; the explicit encoding just makes
    # the file independent of the platform's locale.
    with open(filepath, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(record) + "\n" for record in records)

`get_scholar_papers(title: str, fields: List = ['title', 'authors', 'year', 'abstract', 'journal', 'citations'], *args, **kwargs) -> pd.DataFrame` ¶

Performs a Google Scholar API request for a given title and returns the matching papers with the desired fields.

Parameters:

Name	Type	Description	Default
`title`	`str`	Query for Google Scholar. Should be a single string as it would be entered in the Scholar search field.	required
`fields`	`List`	List of strings with fields to keep in output.	`['title', 'authors', 'year', 'abstract', 'journal', 'citations']`

Returns:

Type	Description
`DataFrame`	pd.DataFrame. One paper per row.

Source code in paperscraper/scholar/scholar.py

def get_scholar_papers(
    title: str,
    fields: List = ["title", "authors", "year", "abstract", "journal", "citations"],
    *args,
    **kwargs,
) -> pd.DataFrame:
    """
    Performs a Google Scholar API request for a given title and returns the
    matching papers with the desired fields.

    Args:
        title: Query for Google Scholar. Should be a single string as it would
            be entered in the Scholar search field.
        fields: List of strings with fields to keep in output.
        *args: Accepted but not used.
        **kwargs: Accepted but not used.

    Raises:
        TypeError: If `title` is not a str.

    Returns:
        pd.DataFrame. One paper per row.

    """
    logger.info(
        "NOTE: Scholar API cannot be used with Boolean logic in keywords. "
        "Query should be a single string to be entered in the Scholar search field."
    )
    if not isinstance(title, str):
        raise TypeError(f"Pass str not {type(title)}")

    matches = scholarly.search_pubs(title)

    def _identity(value):
        # Fallback for fields that have no dedicated post-processing function
        return value

    processed = []
    for paper in matches:
        # Extracts title, author, year, journal, abstract
        entry = {}
        for key, value in paper["bib"].items():
            # Translate the Scholar key to the output field name (once per key)
            field = scholar_field_mapper.get(key, key)
            if field in fields:
                entry[field] = process_fields.get(field, _identity)(value)

        # The citation count lives outside of "bib"; only keep it on request
        # so that `fields` is honoured for every output column.
        if "citations" in fields:
            entry["citations"] = paper["num_citations"]
        processed.append(entry)

    return pd.DataFrame(processed)

`get_and_dump_scholar_papers(title: str, output_filepath: str, fields: List = ['title', 'authors', 'year', 'abstract', 'journal', 'citations']) -> None` ¶

Combines get_scholar_papers and dump_papers.

Parameters:

Name	Type	Description	Default
`title`	`str`	Paper to search for on Google Scholar.	required
`output_filepath`	`str`	Path where the dump will be saved.	required
`fields`	`List`	List of strings with fields to keep in output.	`['title', 'authors', 'year', 'abstract', 'journal', 'citations']`

Source code in paperscraper/scholar/scholar.py

def get_and_dump_scholar_papers(
    title: str,
    output_filepath: str,
    fields: List = ["title", "authors", "year", "abstract", "journal", "citations"],
) -> None:
    """
    Runs get_scholar_papers for `title` and hands the result to dump_papers.

    Args:
        title: Paper to search for on Google Scholar.
        output_filepath: Path where the dump will be saved.
        fields: List of strings with fields to keep in output.
    """
    dump_papers(get_scholar_papers(title, fields), output_filepath)

`scholar` ¶

`get_scholar_papers(title: str, fields: List = ['title', 'authors', 'year', 'abstract', 'journal', 'citations'], *args, **kwargs) -> pd.DataFrame` ¶

Performs a Google Scholar API request for a given title and returns the matching papers with the desired fields.

Parameters:

Name	Type	Description	Default
`title`	`str`	Query for Google Scholar. Should be a single string as it would be entered in the Scholar search field.	required
`fields`	`List`	List of strings with fields to keep in output.	`['title', 'authors', 'year', 'abstract', 'journal', 'citations']`

Returns:

Type	Description
`DataFrame`	pd.DataFrame. One paper per row.

Source code in paperscraper/scholar/scholar.py

def get_scholar_papers(
    title: str,
    fields: List = ["title", "authors", "year", "abstract", "journal", "citations"],
    *args,
    **kwargs,
) -> pd.DataFrame:
    """
    Performs a Google Scholar API request for a given title and returns the
    matching papers with the desired fields.

    Args:
        title: Query for Google Scholar. Should be a single string as it would
            be entered in the Scholar search field.
        fields: List of strings with fields to keep in output.
        *args: Accepted but not used.
        **kwargs: Accepted but not used.

    Raises:
        TypeError: If `title` is not a str.

    Returns:
        pd.DataFrame. One paper per row.

    """
    logger.info(
        "NOTE: Scholar API cannot be used with Boolean logic in keywords. "
        "Query should be a single string to be entered in the Scholar search field."
    )
    if not isinstance(title, str):
        raise TypeError(f"Pass str not {type(title)}")

    matches = scholarly.search_pubs(title)

    def _identity(value):
        # Fallback for fields that have no dedicated post-processing function
        return value

    processed = []
    for paper in matches:
        # Extracts title, author, year, journal, abstract
        entry = {}
        for key, value in paper["bib"].items():
            # Translate the Scholar key to the output field name (once per key)
            field = scholar_field_mapper.get(key, key)
            if field in fields:
                entry[field] = process_fields.get(field, _identity)(value)

        # The citation count lives outside of "bib"; only keep it on request
        # so that `fields` is honoured for every output column.
        if "citations" in fields:
            entry["citations"] = paper["num_citations"]
        processed.append(entry)

    return pd.DataFrame(processed)

`get_and_dump_scholar_papers(title: str, output_filepath: str, fields: List = ['title', 'authors', 'year', 'abstract', 'journal', 'citations']) -> None` ¶

Combines get_scholar_papers and dump_papers.

Parameters:

Name	Type	Description	Default
`title`	`str`	Paper to search for on Google Scholar.	required
`output_filepath`	`str`	Path where the dump will be saved.	required
`fields`	`List`	List of strings with fields to keep in output.	`['title', 'authors', 'year', 'abstract', 'journal', 'citations']`

Source code in paperscraper/scholar/scholar.py

def get_and_dump_scholar_papers(
    title: str,
    output_filepath: str,
    fields: List = ["title", "authors", "year", "abstract", "journal", "citations"],
) -> None:
    """
    Runs get_scholar_papers for `title` and hands the result to dump_papers.

    Args:
        title: Paper to search for on Google Scholar.
        output_filepath: Path where the dump will be saved.
        fields: List of strings with fields to keep in output.
    """
    dump_papers(get_scholar_papers(title, fields), output_filepath)

paperscraper.scholar