paperscraper.citations

`paperscraper.citations` ¶

`citations` ¶

`get_citations_by_doi(doi: str) -> int` ¶

Get the number of citations of a paper according to semantic scholar.

Parameters:

Name	Type	Description	Default
`doi`	`str`	the DOI of the paper.	required

Returns:

Type	Description
`int`	The number of citations

Source code in paperscraper/citations/citations.py

def get_citations_by_doi(doi: str) -> int:
    """
    Get the number of citations of a paper according to semantic scholar.

    A refused connection is retried once after a 10 second pause. Any other
    error (including a second refused connection) propagates to the caller
    instead of being masked.

    Args:
        doi: the DOI of the paper.

    Returns:
        The number of citations; 0 if semantic scholar does not know the paper.

    Raises:
        ConnectionRefusedError: If the connection is refused on the retry as well.
    """
    max_attempts = 2
    for attempt in range(1, max_attempts + 1):
        try:
            return len(sch.get_paper(doi)["citations"])
        except SemanticScholarException.ObjectNotFoundException:
            logger.warning(f"Could not find paper {doi}, assuming 0 citation.")
            return 0
        except ConnectionRefusedError as e:
            if attempt == max_attempts:
                # Out of retries: surface the real error. The previous
                # `finally: return` turned this into an UnboundLocalError.
                raise
            logger.warning(f"Waiting for 10 sec since {doi} gave: {e}")
            sleep(10)
    # Not reachable: the final attempt either returns or raises.
    return 0

`get_citations_from_title(title: str) -> int` ¶

Parameters:

Name	Type	Description	Default
`title`	`str`	Title of paper to be searched on Scholar.	required

Raises:

Type	Description
`TypeError`	If something other than a str is passed.

Returns:

Name	Type	Description
`int`	`int`	Number of citations of paper.

Source code in paperscraper/citations/citations.py

def get_citations_from_title(title: str) -> int:
    """
    Look up the citation count of a paper by searching its title via `scholarly`.

    Args:
        title (str): Title of paper to be searched on Scholar.

    Raises:
        TypeError: If something other than a str is passed.

    Returns:
        int: Citation count of the first match, or 0 if nothing matched.
    """
    if not isinstance(title, str):
        raise TypeError(f"Pass str not {type(title)}")

    # Surround the title with double quotes to ask for an exact match
    title = f'"{title.strip()}"'

    # Collects the citation count of every publication the search yields
    counts = [int(pub["num_citations"]) for pub in scholarly.search_pubs(title)]

    if not counts:
        logger.warning(f"Found no match for {title}.")
        return 0

    if len(counts) > 1:
        logger.warning(f"Found {len(counts)} matches for {title}, returning first one.")
    return counts[0]

`entity` ¶

`core` ¶

`Entity` ¶

An abstract entity class with a set of utilities shared by the objects that perform self-linking analyses, such as Paper and Researcher.

Source code in paperscraper/citations/entity/core.py

class Entity:
    """
    Shared base for objects that run self-linking analyses (e.g. Paper and
    Researcher). It only declares the interface; subclasses provide the logic.

    NOTE(review): the class does not derive from `abc.ABC`, so the
    `@abstractmethod` markers below are not enforced at instantiation time.
    """

    @abstractmethod
    def self_references(self):
        """
        Run the self-reference analysis for the object.
        Must be implemented by the child class.
        """

    @abstractmethod
    def self_citations(self):
        """
        Run the self-citation analysis for the object.
        Must be implemented by the child class.
        """

    @abstractmethod
    def get_result(self):
        """
        Provide the result of the analysis.
        Must be implemented by the child class.
        """

`self_references()` `abstractmethod` ¶

Has to be implemented by the child class. Performs a self-referencing analysis for the object.

Source code in paperscraper/citations/entity/core.py

@abstractmethod
def self_references(self):
    """
    Run the self-reference analysis for the object.
    Must be implemented by the child class.
    """

`self_citations()` `abstractmethod` ¶

Has to be implemented by the child class. Performs a self-citation analysis for the object.

Source code in paperscraper/citations/entity/core.py

@abstractmethod
def self_citations(self):
    """
    Run the self-citation analysis for the object.
    Must be implemented by the child class.
    """

`get_result()` `abstractmethod` ¶

Has to be implemented by the child class. Provides the result of the analysis.

Source code in paperscraper/citations/entity/core.py

@abstractmethod
def get_result(self):
    """
    Provide the result of the analysis.
    Must be implemented by the child class.
    """

`paper` ¶

`Paper` ¶

Bases: Entity

Source code in paperscraper/citations/entity/paper.py

class Paper(Entity):
    # Class-level defaults; `__init__` overrides them per instance when the
    # corresponding lookup succeeds.
    title: str = ""
    doi: str = ""
    authors: List[str] = []

    def __init__(self, input: str, mode: ModeType = "infer"):
        """
        Set up a Paper object for analysis.

        The identifier is resolved to a DOI first; the DOI is then used to
        fetch the paper title and its semantic scholar ID.

        Args:
            input: Paper identifier. This can be the title, DOI or semantic scholar ID
                of the paper.
            mode: The format in which the ID was provided. Defaults to "infer".

        Raises:
            ValueError: If unknown mode is given.
        """
        if mode not in MODES:
            raise ValueError(f"Unknown mode {mode} chose from {MODES}.")

        self.input = input = input.strip()
        resolved_mode = determine_paper_input_type(input) if mode == "infer" else mode

        if resolved_mode == "doi":
            self.doi = input
        elif resolved_mode == "title":
            self.doi = get_doi_from_title(input)
        elif resolved_mode == "ssid":
            self.doi = get_doi_from_ssid(input)

        # The title/ssid lookups can yield None; nothing more to fetch then.
        if self.doi is None:
            return
        metadata = get_title_and_id_from_doi(self.doi)
        if metadata is None:
            return
        self.title = metadata["title"]
        self.ssid = metadata["ssid"]

    def self_references(self):
        """
        Extracts the self references of a paper, for each author.
        Does nothing when no DOI string is available.
        """
        if not isinstance(self.doi, str):
            return
        self.ref_result: ReferenceResult = self_references_paper(self.doi)

    def self_citations(self):
        """
        Extracts the self citations of a paper, for each author.
        Does nothing when no DOI string is available.
        """
        if not isinstance(self.doi, str):
            return
        self.citation_result: CitationResult = self_citations_paper(self.doi)

    def get_result(self) -> Optional[PaperResult]:
        """
        Provides the result of the analysis.

        Returns: PaperResult if both the reference and the citation analysis
            have been run, otherwise None (a warning is logged).
        """
        if not hasattr(self, "ref_result"):
            logger.warning(
                f"Can't get result since no referencing result for {self.input} exists. Run `.self_references` first."
            )
            return None
        if not hasattr(self, "citation_result"):
            logger.warning(
                f"Can't get result since no citation result for {self.input} exists. Run `.self_citations` first."
            )
            return None

        # Drop `ssid` from the reference dump before merging both dumps.
        reference_fields = {
            key: value
            for key, value in self.ref_result.model_dump().items()
            if key != "ssid"
        }
        citation_fields = self.citation_result.model_dump()
        return PaperResult(title=self.title, **reference_fields, **citation_fields)

`__init__(input: str, mode: ModeType = 'infer')` ¶

Set up a Paper object for analysis.

Parameters:

Name	Type	Description	Default
`input`	`str`	Paper identifier. This can be the title, DOI or semantic scholar ID of the paper.	required
`mode`	`ModeType`	The format in which the ID was provided. Defaults to "infer".	`'infer'`

Raises:

Type	Description
`ValueError`	If unknown mode is given.

Source code in paperscraper/citations/entity/paper.py

def __init__(self, input: str, mode: ModeType = "infer"):
    """
    Set up a Paper object for analysis.

    The identifier is resolved to a DOI first; the DOI is then used to
    fetch the paper title and its semantic scholar ID.

    Args:
        input: Paper identifier. This can be the title, DOI or semantic scholar ID
            of the paper.
        mode: The format in which the ID was provided. Defaults to "infer".

    Raises:
        ValueError: If unknown mode is given.
    """
    if mode not in MODES:
        raise ValueError(f"Unknown mode {mode} chose from {MODES}.")

    self.input = input = input.strip()
    resolved_mode = determine_paper_input_type(input) if mode == "infer" else mode

    if resolved_mode == "doi":
        self.doi = input
    elif resolved_mode == "title":
        self.doi = get_doi_from_title(input)
    elif resolved_mode == "ssid":
        self.doi = get_doi_from_ssid(input)

    # The title/ssid lookups can yield None; nothing more to fetch then.
    if self.doi is None:
        return
    metadata = get_title_and_id_from_doi(self.doi)
    if metadata is None:
        return
    self.title = metadata["title"]
    self.ssid = metadata["ssid"]

`self_references()` ¶

Extracts the self references of a paper, for each author.

Source code in paperscraper/citations/entity/paper.py

def self_references(self):
    """
    Extracts the self references of a paper, for each author.
    Does nothing when no DOI string is available.
    """
    if not isinstance(self.doi, str):
        return
    self.ref_result: ReferenceResult = self_references_paper(self.doi)

`self_citations()` ¶

Extracts the self citations of a paper, for each author.

Source code in paperscraper/citations/entity/paper.py

def self_citations(self):
    """
    Extracts the self citations of a paper, for each author.
    Does nothing when no DOI string is available.
    """
    if not isinstance(self.doi, str):
        return
    self.citation_result: CitationResult = self_citations_paper(self.doi)

`get_result() -> Optional[PaperResult]` ¶

Provides the result of the analysis.

Returns: PaperResult if available.

Source code in paperscraper/citations/entity/paper.py

def get_result(self) -> Optional[PaperResult]:
    """
    Provides the result of the analysis.

    Returns: PaperResult if both the reference and the citation analysis
        have been run, otherwise None (a warning is logged).
    """
    if not hasattr(self, "ref_result"):
        logger.warning(
            f"Can't get result since no referencing result for {self.input} exists. Run `.self_references` first."
        )
        return None
    if not hasattr(self, "citation_result"):
        logger.warning(
            f"Can't get result since no citation result for {self.input} exists. Run `.self_citations` first."
        )
        return None

    # Drop `ssid` from the reference dump before merging both dumps.
    reference_fields = {
        key: value
        for key, value in self.ref_result.model_dump().items()
        if key != "ssid"
    }
    citation_fields = self.citation_result.model_dump()
    return PaperResult(title=self.title, **reference_fields, **citation_fields)

`researcher` ¶

`Researcher` ¶

Bases: Entity

Source code in paperscraper/citations/entity/researcher.py

class Researcher(Entity):
    name: str
    ssid: int
    orcid: Optional[str] = None

    def __init__(self, input: str, mode: ModeType = "infer"):
        """
        Construct researcher object for self citation/reference analysis.

        Args:
            input: A researcher to search for.
            mode: How to interpret `input`. The branches handled here are
                `author` (a name), `orcid` (ORCID iD) and `ssaid` (Semantic
                Scholar Author ID). Defaults to "infer", which picks one of
                those three from the shape of `input`.

        Raises:
            ValueError: Unknown mode
        """
        if mode not in MODES:
            raise ValueError(f"Unknown mode {mode} chose from {MODES}.")

        input = input.strip()
        if mode == "infer":
            orcid_chars = input.replace("-", "")
            if input.isdigit():
                mode = "ssaid"
            elif (
                input.count("-") == 3
                and len(input) == 19
                and all(len(part) == 4 for part in input.split("-"))
                and orcid_chars[:-1].isdigit()
                # The last ORCID character is a checksum and may be "X".
                and (orcid_chars[-1].isdigit() or orcid_chars[-1] == "X")
            ):
                mode = "orcid"
            else:
                mode = "author"

        if mode == "ssaid":
            self.author = sch.get_author(input)
            self.ssid = input
        elif mode == "orcid":
            self.author = orcid_to_author_name(input)
            self.orcid = input
            # Look up the author ID by the resolved name, not by the ORCID iD.
            self.ssid = author_name_to_ssaid(self.author)
        elif mode == "author":
            self.author = input
            self.ssid = author_name_to_ssaid(input)

        # TODO: Skip over erratum / corrigendum
        # NOTE(review): `get_papers_for_author` is declared `async` in utils.py;
        # unless it is wrapped elsewhere this stores a coroutine - confirm.
        self.ssids = get_papers_for_author(self.ssid)

    def self_references(self):
        """
        Sifts through all papers of a researcher and extracts the self references.
        Currently only prints the collected paper IDs.
        """
        # TODO: Asynchronous call to self_references
        print("Going through SSIDs", self.ssids)

        # TODO: Aggregate results

    def self_citations(self):
        """
        Sifts through all papers of a researcher and finds how often they are self-cited.
        Not implemented yet.
        """
        ...

    def get_result(self) -> ResearcherResult:
        """
        Provides the result of the analysis. Not implemented yet.
        """
        ...

`__init__(input: str, mode: ModeType = 'infer')` ¶

Construct researcher object for self citation/reference analysis.

Parameters:

Name	Type	Description	Default
`input`	`str`	A researcher to search for.	required
`mode`	`ModeType`	This can be a `name` `orcid` (ORCID iD) or `ssaid` (Semantic Scholar Author ID). Defaults to "infer".	`'infer'`

Raises:

Type	Description
`ValueError`	Unknown mode

Source code in paperscraper/citations/entity/researcher.py

def __init__(self, input: str, mode: ModeType = "infer"):
    """
    Construct researcher object for self citation/reference analysis.

    Args:
        input: A researcher to search for.
        mode: How to interpret `input`. The branches handled here are
            `author` (a name), `orcid` (ORCID iD) and `ssaid` (Semantic
            Scholar Author ID). Defaults to "infer", which picks one of
            those three from the shape of `input`.

    Raises:
        ValueError: Unknown mode
    """
    if mode not in MODES:
        raise ValueError(f"Unknown mode {mode} chose from {MODES}.")

    input = input.strip()
    if mode == "infer":
        orcid_chars = input.replace("-", "")
        if input.isdigit():
            mode = "ssaid"
        elif (
            input.count("-") == 3
            and len(input) == 19
            and all(len(part) == 4 for part in input.split("-"))
            and orcid_chars[:-1].isdigit()
            # The last ORCID character is a checksum and may be "X".
            and (orcid_chars[-1].isdigit() or orcid_chars[-1] == "X")
        ):
            mode = "orcid"
        else:
            mode = "author"

    if mode == "ssaid":
        self.author = sch.get_author(input)
        self.ssid = input
    elif mode == "orcid":
        self.author = orcid_to_author_name(input)
        self.orcid = input
        # Look up the author ID by the resolved name, not by the ORCID iD.
        self.ssid = author_name_to_ssaid(self.author)
    elif mode == "author":
        self.author = input
        self.ssid = author_name_to_ssaid(input)

    # TODO: Skip over erratum / corrigendum
    # NOTE(review): `get_papers_for_author` is declared `async` in utils.py;
    # unless it is wrapped elsewhere this stores a coroutine - confirm.
    self.ssids = get_papers_for_author(self.ssid)

`self_references()` ¶

Sifts through all papers of a researcher and extracts the self references.

Source code in paperscraper/citations/entity/researcher.py

def self_references(self):
    """
    Sifts through all papers of a researcher and extracts the self references.
    Currently only prints the collected paper IDs.
    """
    # TODO: Asynchronous call to self_references
    paper_ids = self.ssids
    print("Going through SSIDs", paper_ids)

`self_citations()` ¶

Sifts through all papers of a researcher and finds how often they are self-cited.

Source code in paperscraper/citations/entity/researcher.py

def self_citations(self):
    """
    Sifts through all papers of a researcher and finds how often they are self-cited.
    Not implemented yet.
    """

`get_result() -> ResearcherResult` ¶

Provides the result of the analysis.

Source code in paperscraper/citations/entity/researcher.py

def get_result(self) -> ResearcherResult:
    """
    Provides the result of the analysis. Not implemented yet.
    """

`orcid` ¶

`orcid_to_author_name(orcid_id: str) -> Optional[str]` ¶

Given an ORCID ID (as a string, e.g. '0000-0002-1825-0097'), returns the full name of the author from the ORCID public API.

Source code in paperscraper/citations/orcid.py

def orcid_to_author_name(orcid_id: str) -> Optional[str]:
    """
    Given an ORCID ID (as a string, e.g. '0000-0002-1825-0097'),
    returns the full name of the author from the ORCID public API.

    Args:
        orcid_id: The ORCID iD to look up.

    Returns:
        The given and family name joined by a space (missing parts are left
        out), or None if the request did not return status 200.
    """

    headers = {"Accept": "application/json"}
    response = requests.get(f"{BASE_URL}{orcid_id}/person", headers=headers)
    if response.status_code != 200:
        logger.error(
            f"Error fetching ORCID data ({orcid_id}): {response.status_code} {response.text}"
        )
        return None

    data = response.json()
    # A key that is present with a JSON null yields None, which a `.get(key, {})`
    # default does not cover; `or` handles both null and missing.
    name = data.get("name") or {}
    given = (name.get("given-names") or {}).get("value") or ""
    family = (name.get("family-name") or {}).get("value") or ""
    return f"{given} {family}".strip()

`self_citations` ¶

`self_citations_paper(inputs: Union[str, List[str]], verbose: bool = False) -> Union[CitationResult, List[CitationResult]]` `async` ¶

Analyze self-citations for one or more papers by DOI or Semantic Scholar ID.

Parameters:

Name	Type	Description	Default
`inputs`	`Union[str, List[str]]`	A single DOI/SSID string or a list of them.	required
`verbose`	`bool`	If True, logs detailed information for each paper.	`False`

Returns:

Type	Description
`Union[CitationResult, List[CitationResult]]`	A single CitationResult if a string was passed, else a list of CitationResults.

Source code in paperscraper/citations/self_citations.py

@optional_async
@retry_with_exponential_backoff(max_retries=4, base_delay=1.0)
async def self_citations_paper(
    inputs: Union[str, List[str]], verbose: bool = False
) -> Union[CitationResult, List[CitationResult]]:
    """
    Run the self-citation analysis for one paper or for a batch of papers.

    All identifiers are processed concurrently over one shared HTTP client.

    Args:
        inputs: A single DOI/SSID string or a list of them.
        verbose: If True, logs detailed information for each paper.

    Returns:
        A single CitationResult if a string was passed, else a list of CitationResults.
    """
    if isinstance(inputs, str):
        single_input, identifiers = True, [inputs]
    else:
        single_input, identifiers = False, list(inputs)

    async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client:
        results = await asyncio.gather(
            *(_process_single(client, ident) for ident in identifiers)
        )

    if verbose:
        for res in results:
            logger.info(
                f'Self-citations in "{res.ssid}": N={res.num_citations}, Score={res.citation_score}%'
            )
            for author, pct in res.self_citations.items():
                logger.info(f"  {author}: {pct}%")

    # Mirror the input shape: one result for a string, a list otherwise.
    if single_input:
        return results[0]
    return results

`self_references` ¶

`self_references_paper(inputs: Union[str, List[str]], verbose: bool = False) -> Union[ReferenceResult, List[ReferenceResult]]` `async` ¶

Analyze self-references for one or more papers by DOI or Semantic Scholar ID.

Parameters:

Name	Type	Description	Default
`inputs`	`Union[str, List[str]]`	A single DOI/SSID string or a list of them.	required
`verbose`	`bool`	If True, logs detailed information for each paper.	`False`

Returns:

Type	Description
`Union[ReferenceResult, List[ReferenceResult]]`	A single ReferenceResult if a string was passed, else a list of ReferenceResults.

Raises:

Type	Description
`ValueError`	If no references are found for a given identifier.

Source code in paperscraper/citations/self_references.py

@optional_async
@retry_with_exponential_backoff(max_retries=4, base_delay=1.0)
async def self_references_paper(
    inputs: Union[str, List[str]], verbose: bool = False
) -> Union[ReferenceResult, List[ReferenceResult]]:
    """
    Run the self-reference analysis for one paper or for a batch of papers.

    All identifiers are processed concurrently over one shared HTTP client.

    Args:
        inputs: A single DOI/SSID string or a list of them.
        verbose: If True, logs detailed information for each paper.

    Returns:
        A single ReferenceResult if a string was passed, else a list of ReferenceResults.

    Raises:
        ValueError: If no references are found for a given identifier
            (presumably raised by `_process_single_reference`; not visible here).
    """
    if isinstance(inputs, str):
        single_input, identifiers = True, [inputs]
    else:
        single_input, identifiers = False, list(inputs)

    async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client:
        results = await asyncio.gather(
            *(_process_single_reference(client, ident) for ident in identifiers)
        )

    if verbose:
        for res in results:
            logger.info(
                f'Self-references in "{res.ssid}": N={res.num_references}, '
                f"Score={res.reference_score}%"
            )
            for author, pct in res.self_references.items():
                logger.info(f"  {author}: {pct}% self-reference")

    # Mirror the input shape: one result for a string, a list otherwise.
    if single_input:
        return results[0]
    return results

`tests` ¶

`test_self_references` ¶

`TestSelfReferences` ¶

Source code in paperscraper/citations/tests/test_self_references.py

class TestSelfReferences:
    @pytest.fixture
    def dois(self):
        # DOIs used as live inputs for the self-reference analysis
        return [
            "10.1038/s41586-023-06600-9",
            "10.1016/j.neunet.2014.09.003",
        ]

    def test_single_doi(self, dois):
        # A plain string input must yield a single ReferenceResult
        result = self_references_paper(dois[0])
        assert isinstance(result, ReferenceResult)
        assert isinstance(result.num_references, int)
        assert result.num_references > 0
        assert isinstance(result.ssid, str)
        assert isinstance(result.reference_score, float)
        assert result.reference_score > 0
        assert isinstance(result.self_references, Dict)
        for author, self_cites in result.self_references.items():
            assert isinstance(author, str)
            assert isinstance(self_cites, float)
            assert 0 <= self_cites <= 100

    def test_multiple_dois(self, dois):
        # A list input must yield one ReferenceResult per identifier
        subset = dois[1:]
        results = self_references_paper(subset)
        assert isinstance(results, list)
        assert len(results) == len(subset)
        for ref_result in results:
            assert isinstance(ref_result, ReferenceResult)
            assert isinstance(ref_result.ssid, str)
            assert isinstance(ref_result.num_references, int)
            assert ref_result.num_references > 0
            assert ref_result.reference_score > 0
            assert isinstance(ref_result.reference_score, float)
            for author, self_cites in ref_result.self_references.items():
                assert isinstance(author, str)
                assert isinstance(self_cites, float)
                assert 0 <= self_cites <= 100

    def test_compare_async_and_sync_performance(self, dois):
        """
        Times one batched call against one call per DOI, checks that both
        give equal results and that the batch is at most twice as slow.
        """

        # Batched call: all DOIs in a single invocation
        t0 = time.perf_counter()
        async_results = self_references_paper(dois)
        async_duration = time.perf_counter() - t0

        # Independent calls: one invocation per DOI
        t0 = time.perf_counter()
        sync_results = list(map(self_references_paper, dois))
        sync_duration = time.perf_counter() - t0

        print(f"Asynchronous execution time (batch): {async_duration:.2f} seconds")
        print(
            f"Synchronous execution time (independent calls): {sync_duration:.2f} seconds"
        )
        for a, s in zip(async_results, sync_results):
            assert a == s, f"{a} vs {s}"

        assert sync_duration >= 0.5 * async_duration, (
            f"Async execution ({async_duration:.2f}s) is slower than sync execution "
            f"({sync_duration:.2f}s)"
        )

`test_compare_async_and_sync_performance(dois)` ¶

Compares the execution time of asynchronous and synchronous self_references for a list of DOIs.

Source code in paperscraper/citations/tests/test_self_references.py

def test_compare_async_and_sync_performance(self, dois):
    """
    Times one batched call against one call per DOI, checks that both
    give equal results and that the batch is at most twice as slow.
    """

    # Batched call: all DOIs in a single invocation
    t0 = time.perf_counter()
    async_results = self_references_paper(dois)
    async_duration = time.perf_counter() - t0

    # Independent calls: one invocation per DOI
    t0 = time.perf_counter()
    sync_results = list(map(self_references_paper, dois))
    sync_duration = time.perf_counter() - t0

    print(f"Asynchronous execution time (batch): {async_duration:.2f} seconds")
    print(
        f"Synchronous execution time (independent calls): {sync_duration:.2f} seconds"
    )
    for a, s in zip(async_results, sync_results):
        assert a == s, f"{a} vs {s}"

    assert sync_duration >= 0.5 * async_duration, (
        f"Async execution ({async_duration:.2f}s) is slower than sync execution "
        f"({sync_duration:.2f}s)"
    )

`utils` ¶

`get_doi_from_title(title: str) -> Optional[str]` ¶

Searches the DOI of a paper based on the paper title

Parameters:

Name	Type	Description	Default
`title`	`str`	Paper title	required

Returns:

Type	Description
`Optional[str]`	DOI according to semantic scholar API

Source code in paperscraper/citations/utils.py

def get_doi_from_title(title: str) -> Optional[str]:
    """
    Look up a paper's DOI from its title using the paper search endpoint.

    Only the first search hit is considered.

    Args:
        title: Paper title

    Returns:
        DOI according to semantic scholar API, or None (with a logged
        warning) if there is no hit or the hit has no DOI.
    """
    query = {"query": title, "fields": "externalIds", "limit": 1}
    payload = requests.get(PAPER_URL + "search", params=query).json()

    hits = payload.get("data")
    doi = hits[0].get("externalIds", {}).get("DOI") if hits else None
    if doi:
        return doi

    logger.warning(f"Did not find DOI for title={title}")
    return None

`get_doi_from_ssid(ssid: str, max_retries: int = 10) -> Optional[str]` ¶

Given a Semantic Scholar paper ID, returns the corresponding DOI if available.

Parameters:

Name	Type	Description	Default
`ssid`	`str`	The paper ID on Semantic Scholar.	required

Returns:

Type	Description
`Optional[str]`	str or None: The DOI of the paper, or None if not found or in case of an error.

Source code in paperscraper/citations/utils.py

def get_doi_from_ssid(ssid: str, max_retries: int = 10) -> Optional[str]:
    """
    Given a Semantic Scholar paper ID, returns the corresponding DOI if available.

    The request is repeated until it returns status 200 or `max_retries`
    attempts have been made, pausing 10 seconds between attempts.

    Parameters:
      ssid (str): The paper ID on Semantic Scholar.
      max_retries (int): Maximum number of requests to send. Defaults to 10.

    Returns:
      str or None: The DOI of the paper, or None if not found or in case of an error.
    """
    logger.warning(
        "Semantic Scholar API is easily overloaded when passing SS IDs, provide DOIs to improve throughput."
    )
    # Stays None when max_retries <= 0, i.e. when no request is sent at all.
    response = None
    for attempt in tqdm(
        range(1, max_retries + 1), desc=f"Fetching DOI for {ssid}", unit="attempt"
    ):
        # Make the GET request to Semantic Scholar.
        response = requests.get(
            f"{PAPER_URL}{ssid}", params={"fields": "externalIds", "limit": 1}
        )

        # If successful, try to extract and return the DOI.
        if response.status_code == 200:
            data = response.json()
            # `or {}` also covers an explicit null for externalIds.
            return (data.get("externalIds") or {}).get("DOI")

        # No point in waiting after the final attempt.
        if attempt < max_retries:
            sleep(10)

    if response is None:
        logger.warning(
            f"Did not find DOI for paper ID {ssid}: no request was sent (max_retries={max_retries})."
        )
    else:
        logger.warning(
            f"Did not find DOI for paper ID {ssid}. Code={response.status_code}, text={response.text}"
        )
    return None

`get_title_and_id_from_doi(doi: str) -> Dict[str, Any]` ¶

Given a DOI, retrieves the paper's title and semantic scholar paper ID.

Parameters:

Name	Type	Description	Default
`doi`	`str`	The DOI of the paper (e.g., "10.18653/v1/N18-3011").	required

Returns:

Type	Description
`Dict[str, Any]`	dict or None: A dictionary with keys 'title' and 'ssid'.

Source code in paperscraper/citations/utils.py

def get_title_and_id_from_doi(doi: str) -> Optional[Dict[str, Any]]:
    """
    Given a DOI, retrieves the paper's title and semantic scholar paper ID.

    Parameters:
        doi (str): The DOI of the paper (e.g., "10.18653/v1/N18-3011").

    Returns:
        dict or None: A dictionary with keys 'title' and 'ssid', or None
            (with a logged warning) if the request did not return status 200.
    """

    # Send the GET request to Semantic Scholar
    response = requests.get(f"{PAPER_URL}DOI:{doi}")
    if response.status_code == 200:
        data = response.json()
        return {"title": data.get("title"), "ssid": data.get("paperId")}

    logger.warning(
        f"Could not get title & semantic scholar ID for DOI={doi}, {response.status_code}: {response.text}"
    )
    return None

`author_name_to_ssaid(author_name: str) -> str` ¶

Given an author name, returns the Semantic Scholar author ID.

Parameters:

Name	Type	Description	Default
`author_name`	`str`	The full name of the author.	required

Returns:

Type	Description
`str`	str or None: The Semantic Scholar author ID or None if no author is found.

Source code in paperscraper/citations/utils.py

def author_name_to_ssaid(author_name: str) -> Optional[str]:
    """
    Given an author name, returns the Semantic Scholar author ID.

    Only the first search result is considered.

    Parameters:
        author_name (str): The full name of the author.

    Returns:
        str or None: The Semantic Scholar author ID or None if no author is found.
    """

    response = requests.get(
        AUTHOR_URL, params={"query": author_name, "fields": "name", "limit": 1}
    )
    if response.status_code != 200:
        logger.error(
            f"Error in retrieving SS Author ID for name {author_name}: "
            f"{response.status_code} - {response.text}"
        )
        return None

    authors = response.json().get("data") or []
    if authors:
        # Return the Semantic Scholar author ID from the first result.
        return authors[0].get("authorId")

    # The request succeeded but matched nobody; report that as such
    # instead of logging a "200" error.
    logger.error(f"No Semantic Scholar author found for name {author_name}")
    return None

`determine_paper_input_type(input: str) -> Literal['ssid', 'doi', 'title']` ¶

Determines the intended input type by the user if not explicitly given (infer).

Parameters:

Name	Type	Description	Default
`input`	`str`	Either a DOI, a semantic scholar paper ID, or a paper title.	required

Returns:

Type	Description
`Literal['ssid', 'doi', 'title']`	The input type

Source code in paperscraper/citations/utils.py

def determine_paper_input_type(input: str) -> Literal["ssid", "doi", "title"]:
    """
    Guess what kind of paper identifier the user passed when the mode is `infer`.

    A string longer than 15 characters that is purely alphanumeric and
    lowercase is taken as a semantic scholar paper ID; a string with exactly
    one DOI_PATTERN match is taken as a DOI; everything else is treated as
    a title.

    Args:
        input: Either a DOI, a semantic scholar paper ID, or a paper title.

    Returns:
        The input type
    """
    # `isalnum()` already rules out spaces, so no separate check is needed.
    looks_like_ssid = len(input) > 15 and input.isalnum() and input.islower()
    if looks_like_ssid:
        return "ssid"

    if len(re.findall(DOI_PATTERN, input, re.IGNORECASE)) == 1:
        return "doi"

    logger.info(
        f"Assuming `{input}` is a paper title, since it seems neither a DOI nor a paper ID"
    )
    return "title"

`get_papers_for_author(ss_author_id: str) -> List[str]` `async` ¶

Given a Semantic Scholar author ID, returns a list of all Semantic Scholar paper IDs for that author.

Parameters:

Name	Type	Description	Default
`ss_author_id`	`str`	The Semantic Scholar author ID (e.g., "1741101").	required

Returns:

Type	Description
`List[str]`	A list of paper IDs (as strings) authored by the given author.

Source code in paperscraper/citations/utils.py

async def get_papers_for_author(ss_author_id: str) -> List[str]:
    """
    Collect the Semantic Scholar paper IDs of all papers listed for an author.

    The author's papers endpoint is read page by page (100 entries per
    request) until a page comes back shorter than the page size.

    Args:
        ss_author_id (str): The Semantic Scholar author ID (e.g., "1741101").

    Returns:
        A list of paper IDs (as strings) authored by the given author.
    """
    limit = 100
    endpoint = f"https://api.semanticscholar.org/graph/v1/author/{ss_author_id}/papers"
    collected = []
    offset = 0

    async with httpx.AsyncClient() as client:
        reached_end = False
        while not reached_end:
            response = await client.get(
                endpoint,
                params={"fields": "paperId", "offset": offset, "limit": limit},
            )
            response.raise_for_status()
            batch = response.json().get("data", [])

            # Keep only entries that actually carry a paper ID.
            collected.extend(entry["paperId"] for entry in batch if "paperId" in entry)

            # A short page means there is nothing left to fetch.
            reached_end = len(batch) < limit
            offset += limit

    return collected

`find_matching(first: List[Dict[str, str]], second: List[Dict[str, str]]) -> List[str]` ¶

Ingests two sets of authors and returns a list of those that match (either based on name or on author ID).

Parameters:

Name	Type	Description	Default
`first`	`List[Dict[str, str]]`	First set of authors given as list of dict with two keys (`authorID` and `name`).	required
`second`	`List[Dict[str, str]]`	Second set of authors given as list of dict with two same keys.	required

Returns:

Type	Description
`List[str]`	List of names of authors in first list where a match was found.

Source code in paperscraper/citations/utils.py

def find_matching(
    first: List[Dict[str, str]], second: List[Dict[str, str]]
) -> List[str]:
    """
    Ingests two sets of authors and returns a list of those that match (either based on name
        or on author ID).

    Args:
        first: First set of authors given as list of dict with two keys (`authorId` and `name`).
        second: Second set of authors given as list of dict with two same keys.

    Returns:
        List of names of authors in first list where a match was found, without
        duplicates and in the order in which they appear in `first`.
    """
    # Authors without an ID (None) must not be matched to each other by ID.
    second_ids = {s.get("authorId") for s in second} - {None}

    matched = []
    for author in first:
        author_id = author.get("authorId")
        id_match = author_id is not None and author_id in second_ids
        # The name comparison only runs when the ID did not already match.
        if id_match or any(
            check_overlap(author["name"], other["name"]) for other in second
        ):
            matched.append(author["name"])

    # De-duplicate while keeping the order of `first`.
    return list(dict.fromkeys(matched))

`check_overlap(n1: str, n2: str) -> bool` ¶

Check whether two author names are identical. TODO: This can be made more robust

Parameters:

Name	Type	Description	Default
`n1`	`str`	first name	required
`n2`	`str`	second name	required

Returns:

Name	Type	Description
`bool`	`bool`	Whether names are identical.

Source code in paperscraper/citations/utils.py

def check_overlap(n1: str, n2: str) -> bool:
    """
    Check whether two author names match.

    Both names are normalised with `clean_name` and split into words; the
    names match when `n2` has at least one word and every word of `n2` also
    occurs in `n1`. The check is therefore not symmetric.
    TODO: This can be made more robust

    Args:
        n1: first name
        n2: second name

    Returns:
        bool: Whether the words of `n2` are all contained in `n1`.
    """
    words_first = set(clean_name(n1).split())
    words_second = set(clean_name(n2).split())
    # Non-empty subset test: adding n2's words to n1's adds nothing new.
    return bool(words_second) and words_second <= words_first

`clean_name(s: str) -> str` ¶

Clean up a str by removing special characters.

Parameters:

Name	Type	Description	Default
`s`	`str`	Input possibly containing special symbols	required

Returns:

Type	Description
`str`	Homogenized string.

Source code in paperscraper/citations/utils.py

def clean_name(s: str) -> str:
    """
    Normalise a string: pass it through `unidecode`, keep only letters and
    whitespace, and lowercase the result.

    Args:
        s: Input possibly containing special symbols

    Returns:
        Homogenized string.
    """
    transliterated = unidecode(s)
    kept = filter(lambda ch: ch.isalpha() or ch.isspace(), transliterated)
    return "".join(kept).lower()

paperscraper.citations