Skip to content

paperscraper.citations

paperscraper.citations

citations

get_citations_by_doi(doi: str) -> int

Get the number of citations of a paper according to semantic scholar.

Parameters:

Name Type Description Default
doi str

the DOI of the paper.

required

Returns:

Type Description
int

The number of citations

Source code in paperscraper/citations/citations.py
def get_citations_by_doi(doi: str) -> int:
    """
    Get the number of citations of a paper according to semantic scholar.

    Args:
        doi: the DOI of the paper.

    Returns:
        The number of citations. 0 is returned (with a warning) when the
        paper cannot be found.

    Raises:
        Exception: Any lookup error other than "paper not found" or a first
            `ConnectionRefusedError` propagates unchanged to the caller.
    """

    # NOTE: the result is returned from each branch instead of from a `finally`
    # clause. A `return` inside `finally` discards the exception in flight and,
    # when no branch assigned the result, turns it into an unrelated
    # UnboundLocalError.
    try:
        return len(sch.get_paper(doi)["citations"])
    except SemanticScholarException.ObjectNotFoundException:
        logger.warning(f"Could not find paper {doi}, assuming 0 citation.")
        return 0
    except ConnectionRefusedError as e:
        logger.warning(f"Waiting for 10 sec since {doi} gave: {e}")
        sleep(10)
        # Single retry; if it fails again the real error reaches the caller.
        return len(sch.get_paper(doi)["citations"])

get_citations_from_title(title: str) -> int

Parameters:

Name Type Description Default
title str

Title of paper to be searched on Scholar.

required

Raises:

Type Description
TypeError

If something other than a str is passed.

Returns:

Name Type Description
int int

Number of citations of paper.

Source code in paperscraper/citations/citations.py
def get_citations_from_title(title: str) -> int:
    """
    Look up a paper by title on Scholar and report its citation count.

    Args:
        title (str): Title of paper to be searched on Scholar.

    Raises:
        TypeError: If something other than a str is passed.

    Returns:
        int: Citation count of the first match, or 0 if nothing matched.
    """

    if not isinstance(title, str):
        raise TypeError(f"Pass str not {type(title)}")

    # Wrap the stripped title in double quotes to request an exact match
    title = f'"{title.strip()}"'

    counts = [int(pub["num_citations"]) for pub in scholarly.search_pubs(title)]
    if not counts:
        logger.warning(f"Found no match for {title}.")
        return 0
    if len(counts) > 1:
        logger.warning(f"Found {len(counts)} matches for {title}, returning first one.")
    return counts[0]

entity

core

Entity

An abstract entity class with a set of utilities shared by the objects that perform self-linking analyses, such as Paper and Researcher.

Source code in paperscraper/citations/entity/core.py
class Entity:
    """
    Common interface for objects that run self-linking analyses
    (e.g. `Paper` and `Researcher`). Child classes provide the
    implementations of the three methods below.
    """

    @abstractmethod
    def self_references(self):
        """
        Run the self-reference analysis for this object.
        Must be implemented by the child class.
        """

    @abstractmethod
    def self_citations(self):
        """
        Run the self-citation analysis for this object.
        Must be implemented by the child class.
        """

    @abstractmethod
    def get_result(self):
        """
        Return the outcome of the analysis.
        Must be implemented by the child class.
        """
self_references() abstractmethod

Has to be implemented by the child class. Performs a self-referencing analysis for the object.

Source code in paperscraper/citations/entity/core.py
@abstractmethod
def self_references(self):
    """
    Run the self-reference analysis for this object.
    Must be implemented by the child class.
    """
self_citations() abstractmethod

Has to be implemented by the child class. Performs a self-citation analysis for the object.

Source code in paperscraper/citations/entity/core.py
@abstractmethod
def self_citations(self):
    """
    Run the self-citation analysis for this object.
    Must be implemented by the child class.
    """
get_result() abstractmethod

Has to be implemented by the child class. Provides the result of the analysis.

Source code in paperscraper/citations/entity/core.py
@abstractmethod
def get_result(self):
    """
    Return the outcome of the analysis.
    Must be implemented by the child class.
    """

paper

Paper

Bases: Entity

Source code in paperscraper/citations/entity/paper.py
class Paper(Entity):
    title: str = ""
    doi: str = ""
    authors: List[str] = []

    def __init__(self, input: str, mode: ModeType = "infer"):
        """
        Prepare a Paper for self-reference / self-citation analysis.

        Args:
            input: Identifier of the paper: its title, DOI or semantic scholar ID.
            mode: How `input` should be interpreted. Defaults to "infer".

        Raises:
            ValueError: If unknown mode is given.
        """
        if mode not in MODES:
            raise ValueError(f"Unknown mode {mode} chose from {MODES}.")

        identifier = input.strip()
        self.input = identifier
        if mode == "infer":
            mode = determine_paper_input_type(identifier)

        # Each supported mode maps to a way of turning the identifier into a DOI
        resolvers = {
            "doi": lambda value: value,
            "title": get_doi_from_title,
            "ssid": get_doi_from_ssid,
        }
        if mode in resolvers:
            self.doi = resolvers[mode](identifier)

        if self.doi is None:
            return
        out = get_title_and_id_from_doi(self.doi)
        if out is None:
            return
        self.title = out["title"]
        self.ssid = out["ssid"]

    def self_references(self):
        """
        Compute the per-author self references of the paper and keep them
        on `self.ref_result`. Nothing happens if no DOI string is available.
        """
        doi = self.doi
        if not isinstance(doi, str):
            return
        self.ref_result: ReferenceResult = self_references_paper(doi)

    def self_citations(self):
        """
        Compute the per-author self citations of the paper and keep them
        on `self.citation_result`. Nothing happens if no DOI string is available.
        """
        doi = self.doi
        if not isinstance(doi, str):
            return
        self.citation_result: CitationResult = self_citations_paper(doi)

    def get_result(self) -> Optional[PaperResult]:
        """
        Combine reference and citation results into one object.

        Returns: PaperResult if both analyses were run, otherwise None
            (a warning is logged naming the missing step).
        """
        if not hasattr(self, "ref_result"):
            logger.warning(
                f"Can't get result since no referencing result for {self.input} exists. Run `.self_references` first."
            )
            return None
        if not hasattr(self, "citation_result"):
            logger.warning(
                f"Can't get result since no citation result for {self.input} exists. Run `.self_citations` first."
            )
            return None

        reference_fields = self.ref_result.model_dump()
        # Drop "ssid" from the reference fields before they are merged
        reference_fields.pop("ssid", None)
        return PaperResult(
            title=self.title,
            **reference_fields,
            **self.citation_result.model_dump(),
        )
__init__(input: str, mode: ModeType = 'infer')

Set up a Paper object for analysis.

Parameters:

Name Type Description Default
input str

Paper identifier. This can be the title, DOI or semantic scholar ID of the paper.

required
mode ModeType

The format in which the ID was provided. Defaults to "infer".

'infer'

Raises:

Type Description
ValueError

If unknown mode is given.

Source code in paperscraper/citations/entity/paper.py
def __init__(self, input: str, mode: ModeType = "infer"):
    """
    Prepare a Paper for self-reference / self-citation analysis.

    Args:
        input: Identifier of the paper: its title, DOI or semantic scholar ID.
        mode: How `input` should be interpreted. Defaults to "infer".

    Raises:
        ValueError: If unknown mode is given.
    """
    if mode not in MODES:
        raise ValueError(f"Unknown mode {mode} chose from {MODES}.")

    identifier = input.strip()
    self.input = identifier
    if mode == "infer":
        mode = determine_paper_input_type(identifier)

    # Each supported mode maps to a way of turning the identifier into a DOI
    resolvers = {
        "doi": lambda value: value,
        "title": get_doi_from_title,
        "ssid": get_doi_from_ssid,
    }
    if mode in resolvers:
        self.doi = resolvers[mode](identifier)

    if self.doi is None:
        return
    out = get_title_and_id_from_doi(self.doi)
    if out is None:
        return
    self.title = out["title"]
    self.ssid = out["ssid"]
self_references()

Extracts the self references of a paper, for each author.

Source code in paperscraper/citations/entity/paper.py
def self_references(self):
    """
    Compute the per-author self references of the paper and keep them
    on `self.ref_result`. Nothing happens if no DOI string is available.
    """
    doi = self.doi
    if not isinstance(doi, str):
        return
    self.ref_result: ReferenceResult = self_references_paper(doi)
self_citations()

Extracts the self citations of a paper, for each author.

Source code in paperscraper/citations/entity/paper.py
def self_citations(self):
    """
    Compute the per-author self citations of the paper and keep them
    on `self.citation_result`. Nothing happens if no DOI string is available.
    """
    doi = self.doi
    if not isinstance(doi, str):
        return
    self.citation_result: CitationResult = self_citations_paper(doi)
get_result() -> Optional[PaperResult]

Provides the result of the analysis.

Returns: PaperResult if available.

Source code in paperscraper/citations/entity/paper.py
def get_result(self) -> Optional[PaperResult]:
    """
    Combine reference and citation results into one object.

    Returns: PaperResult if both analyses were run, otherwise None
        (a warning is logged naming the missing step).
    """
    if not hasattr(self, "ref_result"):
        logger.warning(
            f"Can't get result since no referencing result for {self.input} exists. Run `.self_references` first."
        )
        return None
    if not hasattr(self, "citation_result"):
        logger.warning(
            f"Can't get result since no citation result for {self.input} exists. Run `.self_citations` first."
        )
        return None

    reference_fields = self.ref_result.model_dump()
    # Drop "ssid" from the reference fields before they are merged
    reference_fields.pop("ssid", None)
    return PaperResult(
        title=self.title,
        **reference_fields,
        **self.citation_result.model_dump(),
    )

researcher

Researcher

Bases: Entity

Source code in paperscraper/citations/entity/researcher.py
class Researcher(Entity):
    name: str
    ssid: int
    orcid: Optional[str] = None

    def __init__(self, input: str, mode: ModeType = "infer"):
        """
        Construct researcher object for self citation/reference analysis.

        Args:
            input: A researcher to search for.
            mode: How to interpret `input`. The values handled here are
                `author` (a name), `orcid` (ORCID iD) and `ssaid`
                (Semantic Scholar Author ID). Defaults to "infer", which
                guesses one of these three from the shape of `input`.

        Raises:
            ValueError: Unknown mode
        """
        if mode not in MODES:
            raise ValueError(f"Unknown mode {mode} chose from {MODES}.")

        input = input.strip()
        if mode == "infer":
            parts = input.split("-")
            last = parts[-1]
            if input.isdigit():
                mode = "ssaid"
            elif (
                len(parts) == 4
                and len(input) == 19
                and all(p.isdigit() for p in parts[:-1])
                # The final ORCID character is a checksum that may be "X"
                and (last.isdigit() or (last[:-1].isdigit() and last[-1] in "Xx"))
            ):
                mode = "orcid"
            else:
                mode = "author"

        if mode == "ssaid":
            self.author = sch.get_author(input)
            self.ssid = input
        elif mode == "orcid":
            self.author = orcid_to_author_name(input)
            self.orcid = input
            # Search Semantic Scholar with the resolved name; the raw ORCID
            # string is not an author name.
            self.ssid = author_name_to_ssaid(self.author)
        elif mode == "author":
            self.author = input
            self.ssid = author_name_to_ssaid(input)

        # TODO: Skip over erratum / corrigendum
        # NOTE(review): if `get_papers_for_author` is a plain coroutine function,
        # this stores an un-awaited coroutine rather than a list - confirm.
        self.ssids = get_papers_for_author(self.ssid)

    def self_references(self):
        """
        Sifts through all papers of a researcher and extracts the self references.
        Currently only prints the collected paper IDs.
        """
        # TODO: Asynchronous call to self_references
        print("Going through SSIDs", self.ssids)

        # TODO: Aggregate results

    def self_citations(self):
        """
        Sifts through all papers of a researcher and finds how often they are self-cited.
        Not implemented yet.
        """
        ...

    def get_result(self) -> ResearcherResult:
        """
        Provides the result of the analysis. Not implemented yet.
        """
        ...
__init__(input: str, mode: ModeType = 'infer')

Construct researcher object for self citation/reference analysis.

Parameters:

Name Type Description Default
input str

A researcher to search for.

required
mode ModeType

This can be a name orcid (ORCID iD) or ssaid (Semantic Scholar Author ID). Defaults to "infer".

'infer'

Raises:

Type Description
ValueError

Unknown mode

Source code in paperscraper/citations/entity/researcher.py
def __init__(self, input: str, mode: ModeType = "infer"):
    """
    Construct researcher object for self citation/reference analysis.

    Args:
        input: A researcher to search for.
        mode: How to interpret `input`. The values handled here are
            `author` (a name), `orcid` (ORCID iD) and `ssaid`
            (Semantic Scholar Author ID). Defaults to "infer", which
            guesses one of these three from the shape of `input`.

    Raises:
        ValueError: Unknown mode
    """
    if mode not in MODES:
        raise ValueError(f"Unknown mode {mode} chose from {MODES}.")

    input = input.strip()
    if mode == "infer":
        parts = input.split("-")
        last = parts[-1]
        if input.isdigit():
            mode = "ssaid"
        elif (
            len(parts) == 4
            and len(input) == 19
            and all(p.isdigit() for p in parts[:-1])
            # The final ORCID character is a checksum that may be "X"
            and (last.isdigit() or (last[:-1].isdigit() and last[-1] in "Xx"))
        ):
            mode = "orcid"
        else:
            mode = "author"

    if mode == "ssaid":
        self.author = sch.get_author(input)
        self.ssid = input
    elif mode == "orcid":
        self.author = orcid_to_author_name(input)
        self.orcid = input
        # Search Semantic Scholar with the resolved name; the raw ORCID
        # string is not an author name.
        self.ssid = author_name_to_ssaid(self.author)
    elif mode == "author":
        self.author = input
        self.ssid = author_name_to_ssaid(input)

    # TODO: Skip over erratum / corrigendum
    # NOTE(review): if `get_papers_for_author` is a plain coroutine function,
    # this stores an un-awaited coroutine rather than a list - confirm.
    self.ssids = get_papers_for_author(self.ssid)
self_references()

Sifts through all papers of a researcher and extracts the self references.

Source code in paperscraper/citations/entity/researcher.py
def self_references(self):
    """
    Go over every paper of the researcher to collect self references.
    At present this only prints the paper IDs stored on the object.
    """
    # TODO: Asynchronous call to self_references
    paper_ids = self.ssids
    print("Going through SSIDs", paper_ids)
self_citations()

Sifts through all papers of a researcher and finds how often they are self-cited.

Source code in paperscraper/citations/entity/researcher.py
def self_citations(self):
    """
    Go over every paper of the researcher and determine how often each is
    self-cited. Placeholder: no implementation yet.
    """
    return None
get_result() -> ResearcherResult

Provides the result of the analysis.

Source code in paperscraper/citations/entity/researcher.py
def get_result(self) -> ResearcherResult:
    """
    Return the outcome of the analysis. Placeholder: no implementation yet.
    """
    return None

orcid

orcid_to_author_name(orcid_id: str) -> Optional[str]

Given an ORCID ID (as a string, e.g. '0000-0002-1825-0097'), returns the full name of the author from the ORCID public API.

Source code in paperscraper/citations/orcid.py
def orcid_to_author_name(orcid_id: str) -> Optional[str]:
    """
    Given an ORCID ID (as a string, e.g. '0000-0002-1825-0097'),
    returns the full name of the author from the ORCID public API.

    Args:
        orcid_id: The ORCID iD to look up.

    Returns:
        "<given names> <family name>" with surrounding whitespace stripped
        (possibly an empty string if the record carries no name), or None
        when the API does not answer with status 200.
    """

    headers = {"Accept": "application/json"}
    response = requests.get(f"{BASE_URL}{orcid_id}/person", headers=headers)
    if response.status_code != 200:
        logger.error(
            f"Error fetching ORCID data ({orcid_id}): {response.status_code} {response.text}"
        )
        return None

    # A field may be present with an explicit JSON null, in which case
    # `.get(key, {})` returns None rather than the default; `or` covers both
    # the missing and the null case.
    name = response.json().get("name") or {}
    given = (name.get("given-names") or {}).get("value") or ""
    family = (name.get("family-name") or {}).get("value") or ""
    return f"{given} {family}".strip()

self_citations

self_citations_paper(inputs: Union[str, List[str]], verbose: bool = False) -> Union[CitationResult, List[CitationResult]] async

Analyze self-citations for one or more papers by DOI or Semantic Scholar ID.

Parameters:

Name Type Description Default
inputs Union[str, List[str]]

A single DOI/SSID string or a list of them.

required
verbose bool

If True, logs detailed information for each paper.

False

Returns:

Type Description
Union[CitationResult, List[CitationResult]]

A single CitationResult if a string was passed, else a list of CitationResults.

Source code in paperscraper/citations/self_citations.py
@optional_async
@retry_with_exponential_backoff(max_retries=4, base_delay=1.0)
async def self_citations_paper(
    inputs: Union[str, List[str]], verbose: bool = False
) -> Union[CitationResult, List[CitationResult]]:
    """
    Run the self-citation analysis for one paper or a batch of papers.

    Args:
        inputs: Either one DOI/SSID string or a list of such strings.
        verbose: When True, the per-paper and per-author numbers are logged.

    Returns:
        One CitationResult for a string input, otherwise a list with one
        CitationResult per identifier.
    """
    single_input = isinstance(inputs, str)
    if single_input:
        identifiers = [inputs]
    else:
        identifiers = list(inputs)

    # All identifiers share one HTTP client and are processed concurrently
    async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client:
        results = await asyncio.gather(
            *(_process_single(client, ident) for ident in identifiers)
        )

    if verbose:
        for res in results:
            logger.info(
                f'Self-citations in "{res.ssid}": N={res.num_citations}, Score={res.citation_score}%'
            )
            for author, pct in res.self_citations.items():
                logger.info(f"  {author}: {pct}%")

    if single_input:
        return results[0]
    return results

self_references

self_references_paper(inputs: Union[str, List[str]], verbose: bool = False) -> Union[ReferenceResult, List[ReferenceResult]] async

Analyze self-references for one or more papers by DOI or Semantic Scholar ID.

Parameters:

Name Type Description Default
inputs Union[str, List[str]]

A single DOI/SSID string or a list of them.

required
verbose bool

If True, logs detailed information for each paper.

False

Returns:

Type Description
Union[ReferenceResult, List[ReferenceResult]]

A single ReferenceResult if a string was passed, else a list of ReferenceResults.

Raises:

Type Description
ValueError

If no references are found for a given identifier.

Source code in paperscraper/citations/self_references.py
@optional_async
@retry_with_exponential_backoff(max_retries=4, base_delay=1.0)
async def self_references_paper(
    inputs: Union[str, List[str]], verbose: bool = False
) -> Union[ReferenceResult, List[ReferenceResult]]:
    """
    Run the self-reference analysis for one paper or a batch of papers.

    Args:
        inputs: Either one DOI/SSID string or a list of such strings.
        verbose: When True, the per-paper and per-author numbers are logged.

    Returns:
        One ReferenceResult for a string input, otherwise a list with one
        ReferenceResult per identifier.

    Raises:
        ValueError: If no references are found for a given identifier
            (presumably raised inside `_process_single_reference`; not
            raised directly in this function).
    """
    single_input = isinstance(inputs, str)
    if single_input:
        identifiers = [inputs]
    else:
        identifiers = list(inputs)

    # All identifiers share one HTTP client and are processed concurrently
    async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client:
        results = await asyncio.gather(
            *(_process_single_reference(client, ident) for ident in identifiers)
        )

    if verbose:
        for res in results:
            logger.info(
                f'Self-references in "{res.ssid}": N={res.num_references}, '
                f"Score={res.reference_score}%"
            )
            for author, pct in res.self_references.items():
                logger.info(f"  {author}: {pct}% self-reference")

    if single_input:
        return results[0]
    return results

tests

test_self_references

TestSelfReferences
Source code in paperscraper/citations/tests/test_self_references.py
class TestSelfReferences:
    """Checks `self_references_paper` for single-string and list inputs."""

    @pytest.fixture
    def dois(self):
        """Provide the DOIs used as inputs by the tests in this class."""
        return [
            "10.1038/s41586-023-06600-9",
            "10.1016/j.neunet.2014.09.003",
        ]

    def test_single_doi(self, dois):
        """A single DOI string yields one well-formed `ReferenceResult`."""
        result = self_references_paper(dois[0])
        assert isinstance(result, ReferenceResult)
        assert isinstance(result.num_references, int)
        assert result.num_references > 0
        assert isinstance(result.ssid, str)
        assert isinstance(result.reference_score, float)
        assert result.reference_score > 0
        assert isinstance(result.self_references, Dict)
        # Per-author values are percentages, so they must lie in [0, 100]
        for author, self_cites in result.self_references.items():
            assert isinstance(author, str)
            assert isinstance(self_cites, float)
            assert self_cites >= 0 and self_cites <= 100

    def test_multiple_dois(self, dois):
        """A list input yields a list with one `ReferenceResult` per DOI."""
        # With the two-DOI fixture, `dois[1:]` is a one-element list
        results = self_references_paper(dois[1:])
        assert isinstance(results, list)
        assert len(results) == len(dois[1:])
        for ref_result in results:
            assert isinstance(ref_result, ReferenceResult)
            assert isinstance(ref_result.ssid, str)
            assert isinstance(ref_result.num_references, int)
            assert ref_result.num_references > 0
            assert ref_result.reference_score > 0
            assert isinstance(ref_result.reference_score, float)
            for author, self_cites in ref_result.self_references.items():
                assert isinstance(author, str)
                assert isinstance(self_cites, float)
                assert self_cites >= 0 and self_cites <= 100

    def test_compare_async_and_sync_performance(self, dois):
        """
        Compares the execution time of one batched `self_references_paper` call
        against one separate call per DOI, and checks both give equal results.
        """

        # Time a single call that receives the whole DOI list
        start_time = time.perf_counter()
        async_results = self_references_paper(dois)
        async_duration = time.perf_counter() - start_time

        # Measure synchronous execution time (one independent call per DOI)
        start_time = time.perf_counter()
        sync_results = [self_references_paper(doi) for doi in dois]

        sync_duration = time.perf_counter() - start_time

        print(f"Asynchronous execution time (batch): {async_duration:.2f} seconds")
        print(
            f"Synchronous execution time (independent calls): {sync_duration:.2f} seconds"
        )
        for a, s in zip(async_results, sync_results):
            assert a == s, f"{a} vs {s}"

        # The batch call may be slightly slower than the separate calls: the
        # check only fails once 0.9 * async_duration exceeds sync_duration.
        assert 0.9 * async_duration <= sync_duration, (
            f"Async execution ({async_duration:.2f}s) is slower than sync execution "
            f"({sync_duration:.2f}s)"
        )
test_compare_async_and_sync_performance(dois)

Compares the execution time of asynchronous and synchronous self_references for a list of DOIs.

Source code in paperscraper/citations/tests/test_self_references.py
def test_compare_async_and_sync_performance(self, dois):
    """
    Compares the execution time of one batched `self_references_paper` call
    against one separate call per DOI, and checks both give equal results.
    """

    # Time a single call that receives the whole DOI list
    start_time = time.perf_counter()
    async_results = self_references_paper(dois)
    async_duration = time.perf_counter() - start_time

    # Measure synchronous execution time (one independent call per DOI)
    start_time = time.perf_counter()
    sync_results = [self_references_paper(doi) for doi in dois]

    sync_duration = time.perf_counter() - start_time

    print(f"Asynchronous execution time (batch): {async_duration:.2f} seconds")
    print(
        f"Synchronous execution time (independent calls): {sync_duration:.2f} seconds"
    )
    for a, s in zip(async_results, sync_results):
        assert a == s, f"{a} vs {s}"

    # The batch call may be slightly slower than the separate calls: the
    # check only fails once 0.9 * async_duration exceeds sync_duration.
    assert 0.9 * async_duration <= sync_duration, (
        f"Async execution ({async_duration:.2f}s) is slower than sync execution "
        f"({sync_duration:.2f}s)"
    )

utils

get_doi_from_title(title: str) -> Optional[str]

Searches the DOI of a paper based on the paper title

Parameters:

Name Type Description Default
title str

Paper title

required

Returns:

Type Description
Optional[str]

DOI according to semantic scholar API

Source code in paperscraper/citations/utils.py
def get_doi_from_title(title: str) -> Optional[str]:
    """
    Look up a paper by its title and return the DOI of the top search hit.

    Args:
        title: Paper title

    Returns:
        DOI according to semantic scholar API, or None (after logging a
        warning) when the search returned no hit or the hit has no DOI.
    """
    query = {"query": title, "fields": "externalIds", "limit": 1}
    payload = requests.get(PAPER_URL + "search", params=query).json()

    hits = payload.get("data")
    if hits:
        # Only one result was requested, so the first entry is the best match
        doi = hits[0].get("externalIds", {}).get("DOI")
        if doi:
            return doi
    logger.warning(f"Did not find DOI for title={title}")
    return None

get_doi_from_ssid(ssid: str, max_retries: int = 10) -> Optional[str]

Given a Semantic Scholar paper ID, returns the corresponding DOI if available.

Parameters:

Name Type Description Default
ssid str

The paper ID on Semantic Scholar.

required

Returns:

Type Description
Optional[str]

str or None: The DOI of the paper, or None if not found or in case of an error.

Source code in paperscraper/citations/utils.py
def get_doi_from_ssid(ssid: str, max_retries: int = 10) -> Optional[str]:
    """
    Given a Semantic Scholar paper ID, returns the corresponding DOI if available.

    Parameters:
      ssid (str): The paper ID on Semantic Scholar.
      max_retries (int): Maximum number of requests to send. Between two
        attempts the function waits 10 seconds. Defaults to 10.

    Returns:
      str or None: The DOI of the paper, or None if not found or in case of an error.
    """
    logger.warning(
        "Semantic Scholar API is easily overloaded when passing SS IDs, provide DOIs to improve throughput."
    )
    # Stays None when `max_retries` is below 1 and no request is sent at all
    response = None
    for attempt in tqdm(
        range(1, max_retries + 1), desc=f"Fetching DOI for {ssid}", unit="attempt"
    ):
        # Make the GET request to Semantic Scholar.
        response = requests.get(
            f"{PAPER_URL}{ssid}", params={"fields": "externalIds", "limit": 1}
        )

        # If successful, try to extract and return the DOI.
        if response.status_code == 200:
            # `externalIds` may be missing or null; both yield no DOI
            return (response.json().get("externalIds") or {}).get("DOI")

        # Wait before the next attempt, but not after the final one
        if attempt < max_retries:
            sleep(10)

    if response is None:
        logger.warning(
            f"Did not find DOI for paper ID {ssid}: no request was sent (max_retries={max_retries})."
        )
    else:
        logger.warning(
            f"Did not find DOI for paper ID {ssid}. Code={response.status_code}, text={response.text}"
        )
    return None

get_title_and_id_from_doi(doi: str) -> Dict[str, Any]

Given a DOI, retrieves the paper's title and semantic scholar paper ID.

Parameters:

Name Type Description Default
doi str

The DOI of the paper (e.g., "10.18653/v1/N18-3011").

required

Returns:

Type Description
Dict[str, Any]

dict or None: A dictionary with keys 'title' and 'ssid'.

Source code in paperscraper/citations/utils.py
def get_title_and_id_from_doi(doi: str) -> Optional[Dict[str, Any]]:
    """
    Given a DOI, retrieves the paper's title and semantic scholar paper ID.

    Parameters:
        doi (str): The DOI of the paper (e.g., "10.18653/v1/N18-3011").

    Returns:
        dict or None: A dictionary with keys 'title' and 'ssid', or None
            (after logging a warning) when the request does not return
            status 200.
    """

    # Send the GET request to Semantic Scholar
    response = requests.get(f"{PAPER_URL}DOI:{doi}")
    if response.status_code == 200:
        data = response.json()
        return {"title": data.get("title"), "ssid": data.get("paperId")}

    # The message names what this function fetches (title and paper ID)
    logger.warning(
        f"Could not get title & semantic scholar ID for DOI={doi}, {response.status_code}: {response.text}"
    )
    return None

author_name_to_ssaid(author_name: str) -> str

Given an author name, returns the Semantic Scholar author ID.

Parameters:

Name Type Description Default
author_name str

The full name of the author.

required

Returns:

Type Description
str

str or None: The Semantic Scholar author ID or None if no author is found.

Source code in paperscraper/citations/utils.py
def author_name_to_ssaid(author_name: str) -> Optional[str]:
    """
    Given an author name, returns the Semantic Scholar author ID.

    Parameters:
        author_name (str): The full name of the author.

    Returns:
        str or None: The Semantic Scholar author ID of the first search
            result, or None if the request fails or no author is found.
    """

    response = requests.get(
        AUTHOR_URL, params={"query": author_name, "fields": "name", "limit": 1}
    )
    if response.status_code != 200:
        logger.error(
            f"Error in retrieving SS Author ID for name {author_name}: {response.status_code} - {response.text}"
        )
        return None

    authors = response.json().get("data", [])
    if authors:
        # Return the Semantic Scholar author ID from the first result.
        return authors[0].get("authorId")

    # The request succeeded but returned no hit; report that explicitly
    # instead of logging a "200" error.
    logger.error(f"No Semantic Scholar author found for name {author_name}")
    return None

determine_paper_input_type(input: str) -> Literal['ssid', 'doi', 'title']

Determines the intended input type by the user if not explicitly given (infer).

Parameters:

Name Type Description Default
input str

A DOI, a Semantic Scholar paper ID, or a paper title.

required

Returns:

Type Description
Literal['ssid', 'doi', 'title']

The input type

Source code in paperscraper/citations/utils.py
def determine_paper_input_type(input: str) -> Literal["ssid", "doi", "title"]:
    """
    Guess which kind of paper identifier the user passed (used for `infer`).

    Args:
        input: A DOI, a semantic scholar paper ID or a paper title.

    Returns:
        The input type
    """
    # Paper IDs: longer than 15 characters, no spaces, alphanumeric, lowercase
    looks_like_ssid = (
        len(input) > 15
        and " " not in input
        and input.isalnum()
        and input.islower()
    )
    if looks_like_ssid:
        return "ssid"

    # Exactly one DOI pattern hit in the string is treated as a DOI
    if len(re.findall(DOI_PATTERN, input, re.IGNORECASE)) == 1:
        return "doi"

    logger.info(
        f"Assuming `{input}` is a paper title, since it seems neither a DOI nor a paper ID"
    )
    return "title"

get_papers_for_author(ss_author_id: str) -> List[str] async

Given a Semantic Scholar author ID, returns a list of all Semantic Scholar paper IDs for that author.

Parameters:

Name Type Description Default
ss_author_id str

The Semantic Scholar author ID (e.g., "1741101").

required

Returns:

Type Description
List[str]

A list of paper IDs (as strings) authored by the given author.

Source code in paperscraper/citations/utils.py
async def get_papers_for_author(ss_author_id: str) -> List[str]:
    """
    Collect the Semantic Scholar paper IDs of all papers listed for an author.

    Args:
        ss_author_id (str): The Semantic Scholar author ID (e.g., "1741101").

    Returns:
        A list of paper IDs (as strings) authored by the given author.
    """
    page_size = 100
    url = f"https://api.semanticscholar.org/graph/v1/author/{ss_author_id}/papers"
    collected = []

    async with httpx.AsyncClient() as client:
        offset = 0
        while True:
            response = await client.get(
                url,
                params={"fields": "paperId", "offset": offset, "limit": page_size},
            )
            response.raise_for_status()
            page = response.json().get("data", [])

            # Keep the ID of every entry on this page that carries one
            collected.extend(
                entry["paperId"] for entry in page if "paperId" in entry
            )

            # A page shorter than the page size is the last one
            if len(page) < page_size:
                break
            offset += page_size

    return collected

find_matching(first: List[Dict[str, str]], second: List[Dict[str, str]]) -> List[str]

Ingests two sets of authors and returns a list of those that match (either based on name or on author ID).

Parameters:

Name Type Description Default
first List[Dict[str, str]]

First set of authors given as list of dict with two keys (authorId and name).

required
second List[Dict[str, str]]

Second set of authors given as list of dict with two same keys.

required

Returns:

Type Description
List[str]

List of names of authors in first list where a match was found.

Source code in paperscraper/citations/utils.py
def find_matching(
    first: List[Dict[str, str]], second: List[Dict[str, str]]
) -> List[str]:
    """
    Ingests two sets of authors and returns a list of those that match (either based on name
        or on author ID).

    Args:
        first: First set of authors given as list of dict with two keys (`authorId` and `name`).
        second: Second set of authors given as list of dict with two same keys.

    Returns:
        List of names of authors in first list where a match was found. Each
        name appears once, in the order of its first occurrence in `first`.
    """
    # IDs present in the second list. A missing ID (None) is not an identity,
    # so two authors without ID must not be matched through it.
    second_ids = {s["authorId"] for s in second if s["authorId"] is not None}

    matched = []
    for f in first:
        if f["authorId"] is not None and f["authorId"] in second_ids:
            # Same author ID on both sides: no name comparison needed
            matched.append(f["name"])
        elif any(check_overlap(f["name"], s["name"]) for s in second):
            # No ID match: fall back to comparing the names
            matched.append(f["name"])

    # Drop duplicate names while keeping first-seen order
    return list(dict.fromkeys(matched))

check_overlap(n1: str, n2: str) -> bool

Check whether two author names are identical. TODO: This can be made more robust

Parameters:

Name Type Description Default
n1 str

first name

required
n2 str

second name

required

Returns:

Name Type Description
bool bool

Whether names are identical.

Source code in paperscraper/citations/utils.py
def check_overlap(n1: str, n2: str) -> bool:
    """
    Decide whether two author names refer to the same name.
    TODO: This can be made more robust

    Both names are cleaned with `clean_name` and split into words; the names
    count as matching when `n2` has at least one word and every word of `n2`
    also occurs in `n1`.

    Args:
        n1: first name
        n2: second name

    Returns:
        bool: Whether the names match under the rule above.
    """
    words_first = set(clean_name(n1).split())
    words_second = set(clean_name(n2).split())
    if not words_second:
        return False
    # All words of the second name must be contained in the first one
    return words_second.issubset(words_first)

clean_name(s: str) -> str

Clean up a str by removing special characters.

Parameters:

Name Type Description Default
s str

Input possibly containing special symbols

required

Returns:

Type Description
str

Homogenized string.

Source code in paperscraper/citations/utils.py
def clean_name(s: str) -> str:
    """
    Normalize a string: transliterate it with `unidecode`, keep only
    letters and whitespace, and lowercase the result.

    Args:
        s: Input possibly containing special symbols

    Returns:
        Homogenized string.
    """
    transliterated = unidecode(s)
    kept = [ch for ch in transliterated if ch.isalpha() or ch.isspace()]
    return "".join(kept).lower()