Skip to content

paperscraper.citations

paperscraper.citations

citations

get_citations_by_doi(doi: str) -> int

Get the number of citations of a paper according to Semantic Scholar.

Parameters:

Name Type Description Default
doi str

the DOI of the paper.

required

Returns:

Type Description
int

The number of citations

Source code in paperscraper/citations/citations.py
def get_citations_by_doi(doi: str) -> int:
    """
    Get the number of citations of a paper according to semantic scholar.

    A paper that cannot be found is reported with 0 citations. If the
    connection is refused, the lookup is retried once after a 10 second pause.

    Args:
        doi: the DOI of the paper.

    Returns:
        The number of citations

    Raises:
        Exception: Any error other than the two handled below, as well as a
            failure of the single retry, is propagated to the caller.
    """

    # NOTE: the result is returned from each branch instead of from a
    # `finally` clause. A `return` inside `finally` discards the in-flight
    # exception and, because no count was ever bound, surfaced every
    # unexpected failure as an unrelated UnboundLocalError.
    try:
        return len(sch.get_paper(doi)["citations"])
    except SemanticScholarException.ObjectNotFoundException:
        logger.warning(f"Could not find paper {doi}, assuming 0 citation.")
        return 0
    except ConnectionRefusedError as e:
        logger.warning(f"Waiting for 10 sec since {doi} gave: {e}")
        sleep(10)
        # Single retry; if it fails again the error reaches the caller.
        return len(sch.get_paper(doi)["citations"])

get_citations_from_title(title: str) -> int

Parameters:

Name Type Description Default
title str

Title of paper to be searched on Scholar.

required

Raises:

Type Description
TypeError

If something other than a str is passed.

Returns:

Name Type Description
int int

Number of citations of paper.

Source code in paperscraper/citations/citations.py
def get_citations_from_title(title: str) -> int:
    """
    Look up a paper by its exact title and report its citation count.

    The stripped title is wrapped in double quotes before it is handed to
    `scholarly.search_pubs`. All matches are consumed; when there are several,
    the count of the first one is returned and a warning is logged.

    Args:
        title (str): Title of paper to be searched on Scholar.

    Raises:
        TypeError: If something other than a str is passed.

    Returns:
        int: Number of citations of paper, or 0 if nothing matched.
    """

    if not isinstance(title, str):
        raise TypeError(f"Pass str not {type(title)}")

    # Quote the title so that the search looks for an exact match
    title = f'"{title.strip()}"'

    counts = [int(pub["num_citations"]) for pub in scholarly.search_pubs(title)]

    if not counts:
        logger.warning(f"Found no match for {title}.")
        return 0

    if len(counts) > 1:
        logger.warning(f"Found {len(counts)} matches for {title}, returning first one.")
    return counts[0]

entity

core

Entity

An abstract entity class with a set of utilities shared by the objects that perform self-linking analyses, such as Paper and Researcher.

Source code in paperscraper/citations/entity/core.py
class Entity:
    """
    Common interface for objects that run self-linking analyses, such as
    Paper and Researcher.

    NOTE(review): the methods are marked with `@abstractmethod`, but the class
    declares no ABC base or metaclass here, so Python does not enforce that
    subclasses override them; an un-overridden method simply returns None.
    """

    @abstractmethod
    def self_references(self):
        """
        Run the self-reference analysis for this object.

        Placeholder: child classes have to provide the implementation.
        """

    @abstractmethod
    def self_citations(self):
        """
        Run the self-citation analysis for this object.

        Placeholder: child classes have to provide the implementation.
        """

    @abstractmethod
    def get_result(self):
        """
        Provide the result of the analysis.

        Placeholder: child classes have to provide the implementation.
        """
self_references() abstractmethod

Has to be implemented by the child class. Performs a self-referencing analysis for the object.

Source code in paperscraper/citations/entity/core.py
@abstractmethod
def self_references(self):
    """
    Run the self-reference analysis for this object.

    Placeholder: child classes have to provide the implementation.
    """
self_citations() abstractmethod

Has to be implemented by the child class. Performs a self-citation analysis for the object.

Source code in paperscraper/citations/entity/core.py
@abstractmethod
def self_citations(self):
    """
    Run the self-citation analysis for this object.

    Placeholder: child classes have to provide the implementation.
    """
get_result() abstractmethod

Has to be implemented by the child class. Provides the result of the analysis.

Source code in paperscraper/citations/entity/core.py
@abstractmethod
def get_result(self):
    """
    Provide the result of the analysis.

    Placeholder: child classes have to provide the implementation.
    """

paper

Paper

Bases: Entity

Source code in paperscraper/citations/entity/paper.py
class Paper(Entity):
    """
    A single paper on which self-reference and self-citation analyses are run.

    The paper is resolved to a DOI at construction time; the analyses are
    executed lazily and cached on the instance as `self_ref` / `self_cite`.
    """

    title: str = ""
    # NOTE(review): annotated as str, but the constructor checks it against
    # None, so the lookup helpers can presumably return None - confirm.
    doi: str = ""
    authors: List[str] = []

    def __init__(self, input: str, mode: ModeType = "infer"):
        """
        Set up a Paper object for analysis.

        Args:
            input: Paper identifier. This can be the title, DOI or semantic scholar ID
                of the paper.
            mode: The format in which the ID was provided. Defaults to "infer".

        Raises:
            ValueError: If unknown mode is given.
        """
        if mode not in MODES:
            raise ValueError(f"Unknown mode {mode} chose from {MODES}.")

        input = input.strip()
        self.input = input
        if mode == "infer":
            mode = determine_paper_input_type(input)

        # Whatever the identifier type, the paper is normalised to a DOI.
        if mode == "doi":
            self.doi = input
        elif mode == "title":
            self.doi = get_doi_from_title(input)
        elif mode == "ssid":
            self.doi = get_doi_from_ssid(input)

        # Title and Semantic Scholar ID are only filled in when the lookup
        # succeeds; otherwise `title` keeps its class default and `ssid`
        # stays unset.
        if self.doi is not None:
            out = get_title_and_id_from_doi(self.doi)
            if out is not None:
                self.title = out["title"]
                self.ssid = out["ssid"]

    def self_references(self):
        """
        Extracts the self references of a paper, for each author.

        The outcome is stored in `self.self_ref`. Nothing happens when the
        paper has no string DOI.
        """
        if isinstance(self.doi, str):
            self.self_ref: ReferenceResult = self_references_paper(self.doi)

    def self_citations(self):
        """
        Extracts the self citations of a paper, for each author.

        The outcome is stored in `self.self_cite`. Nothing happens when the
        paper has no string DOI.
        """
        if isinstance(self.doi, str):
            self.self_cite: CitationResult = self_citations_paper(self.doi)

    def get_result(self) -> Optional[PaperResult]:
        """
        Provides the result of the analysis, running it first if necessary.

        Returns:
            PaperResult if available, None when the analyses could not be run
            (no DOI was resolved for this paper).
        """
        if not hasattr(self, "self_ref"):
            self.self_references()
        if not hasattr(self, "self_cite"):
            self.self_citations()

        # Both analyses silently skip papers without a DOI, in which case the
        # cached attributes do not exist and there is nothing to merge.
        if not (hasattr(self, "self_ref") and hasattr(self, "self_cite")):
            return None

        return PaperResult(
            title=self.title,
            **{
                k: v
                for k, v in self.self_ref.model_dump().items()
                if k not in ["ssid", "title"]
            },
            **{
                k: v
                for k, v in self.self_cite.model_dump().items()
                if k not in ["title"]
            },
        )
__init__(input: str, mode: ModeType = 'infer')

Set up a Paper object for analysis.

Parameters:

Name Type Description Default
input str

Paper identifier. This can be the title, DOI or semantic scholar ID of the paper.

required
mode ModeType

The format in which the ID was provided. Defaults to "infer".

'infer'

Raises:

Type Description
ValueError

If unknown mode is given.

Source code in paperscraper/citations/entity/paper.py
def __init__(self, input: str, mode: ModeType = "infer"):
    """
    Prepare a Paper for analysis by resolving its identifier to a DOI.

    Args:
        input: Paper identifier. This can be the title, DOI or semantic scholar ID
            of the paper.
        mode: The format in which the ID was provided. Defaults to "infer".

    Raises:
        ValueError: If unknown mode is given.
    """
    if mode not in MODES:
        raise ValueError(f"Unknown mode {mode} chose from {MODES}.")

    identifier = input.strip()
    self.input = identifier
    resolved_mode = (
        determine_paper_input_type(identifier) if mode == "infer" else mode
    )

    # Normalise every kind of identifier to a DOI.
    if resolved_mode == "doi":
        self.doi = identifier
    elif resolved_mode == "title":
        self.doi = get_doi_from_title(identifier)
    elif resolved_mode == "ssid":
        self.doi = get_doi_from_ssid(identifier)

    if self.doi is None:
        return

    # Title and Semantic Scholar ID are only set when the lookup succeeds.
    metadata = get_title_and_id_from_doi(self.doi)
    if metadata is None:
        return
    self.title = metadata["title"]
    self.ssid = metadata["ssid"]
self_references()

Extracts the self references of a paper, for each author.

Source code in paperscraper/citations/entity/paper.py
def self_references(self):
    """
    Compute the per-author self references of the paper.

    The outcome is stored in `self.self_ref`; papers without a string DOI are
    skipped.
    """
    if not isinstance(self.doi, str):
        return
    self.self_ref: ReferenceResult = self_references_paper(self.doi)
self_citations()

Extracts the self citations of a paper, for each author.

Source code in paperscraper/citations/entity/paper.py
def self_citations(self):
    """
    Compute the per-author self citations of the paper.

    The outcome is stored in `self.self_cite`; papers without a string DOI are
    skipped.
    """
    if not isinstance(self.doi, str):
        return
    self.self_cite: CitationResult = self_citations_paper(self.doi)
get_result() -> Optional[PaperResult]

Provides the result of the analysis.

Returns: PaperResult if available.

Source code in paperscraper/citations/entity/paper.py
def get_result(self) -> Optional[PaperResult]:
    """
    Provides the result of the analysis, running it first if necessary.

    Returns:
        PaperResult if available, None when the analyses could not be run
        (they are skipped for a paper without a DOI).
    """
    if not hasattr(self, "self_ref"):
        self.self_references()
    if not hasattr(self, "self_cite"):
        self.self_citations()

    # The analyses leave their cache attributes unset when they are skipped;
    # report "no result" instead of failing on the missing attribute.
    if not (hasattr(self, "self_ref") and hasattr(self, "self_cite")):
        return None

    return PaperResult(
        title=self.title,
        **{
            k: v
            for k, v in self.self_ref.model_dump().items()
            if k not in ["ssid", "title"]
        },
        **{
            k: v
            for k, v in self.self_cite.model_dump().items()
            if k not in ["title"]
        },
    )

researcher

Researcher

Bases: Entity

Source code in paperscraper/citations/entity/researcher.py
class Researcher(Entity):
    """
    A researcher whose papers are analysed for self-references and
    self-citations.

    The aggregated outcome lives in `self.result`. Its `num_references` and
    `num_citations` fields start at -1, meaning "not computed yet", and are
    replaced once the corresponding analysis has run.
    """

    name: str
    # NOTE(review): annotated as int, but in `ssaid` mode the stripped input
    # string is stored and the async helpers compare against the string "-1".
    ssaid: int
    orcid: Optional[str] = None
    # Class-level default; the methods below rebind `self.ssids` rather than
    # mutating this shared list.
    ssids: List[int] = []

    @staticmethod
    def _looks_like_orcid(candidate: str) -> bool:
        """Return True if `candidate` is shaped like an ORCID iD (dddd-dddd-dddd-dddX)."""
        if len(candidate) != 19 or candidate.count("-") != 3:
            return False
        *leading, last = candidate.split("-")
        # The final character of an ORCID iD is a checksum that may be "X".
        if last.endswith("X"):
            last = last[:-1]
        return last.isdigit() and all(part.isdigit() for part in leading)

    def __init__(self, input: str, mode: ModeType = "infer"):
        """
        Construct researcher object for self citation/reference analysis.

        Args:
            input: A researcher to search for, identified by name, ORCID iD, or Semantic Scholar Author ID.
            mode: This can be a `name` `orcid` (ORCID iD) or `ssaid` (Semantic Scholar Author ID).
                Defaults to "infer".

        Raises:
            ValueError: Unknown mode
        """
        if mode not in MODES:
            raise ValueError(f"Unknown mode {mode} chose from {MODES}.")

        input = input.strip()
        if mode == "infer":
            # Purely numeric input is taken as a Semantic Scholar author ID,
            # ORCID-shaped input as an ORCID iD, anything else as a name.
            if input.isdigit():
                mode = "ssaid"
            elif self._looks_like_orcid(input):
                mode = "orcid"
            else:
                mode = "name"

        if mode == "ssaid":
            self.name = sch.get_author(input)._name
            self.ssaid = input
        elif mode == "orcid":
            orcid_name = orcid_to_author_name(input)
            self.orcid = input
            self.ssaid, self.name = author_name_to_ssaid(orcid_name)
        elif mode == "name":
            # The name reported by the lookup replaces the one typed by the user.
            self.ssaid, self.name = author_name_to_ssaid(input)

        self.result = ResearcherResult(
            name=self.name,
            ssaid=int(self.ssaid),
            orcid=self.orcid,
            # -1 marks "analysis not run yet" (see get_result).
            num_citations=-1,
            num_references=-1,
        )

    async def _self_references_async(
        self, verbose: bool = False
    ) -> List[ReferenceResult]:
        """
        Async version of self_references.

        Returns one ReferenceResult per retained paper of the researcher.
        """
        # NOTE(review): "-1" presumably marks an author that could not be
        # resolved - confirm against author_name_to_ssaid.
        if self.ssaid == "-1":
            return []
        if self.ssids == []:
            self.ssids = await get_papers_for_author(self.ssaid)

        results: List[ReferenceResult] = await self_references_paper(
            self.ssids, verbose=verbose
        )
        # Remove papers with zero references or that are erratum/corrigendum
        results = [
            r
            for r in results
            if r.num_references > 0
            and "erratum" not in r.title.lower()
            and "corrigendum" not in r.title.lower()
        ]

        return results

    def self_references(self, verbose: bool = False) -> ResearcherResult:
        """
        Sifts through all papers of a researcher and extracts the self references.

        Args:
            verbose: If True, logs detailed information for each paper.

        Returns:
            A ResearcherResult containing aggregated self-reference data.
        """
        reference_results = asyncio.run(self._self_references_async(verbose=verbose))

        # Per paper title: the value recorded for this researcher, 0.0 if absent.
        individual_self_references = {
            result.title: result.self_references.get(self.name, 0.0)
            for result in reference_results
        }
        # max(1, ...) avoids a division by zero when no paper was retained.
        reference_ratio = sum(individual_self_references.values()) / max(
            1, len(individual_self_references)
        )

        self.result = self.result.model_copy(
            update={
                "num_references": sum(r.num_references for r in reference_results),
                "self_references": dict(
                    sorted(
                        individual_self_references.items(),
                        key=lambda x: x[1],
                        reverse=True,
                    )
                ),
                "self_reference_ratio": round(reference_ratio, 3),
            }
        )

        return self.result

    async def _self_citations_async(
        self, verbose: bool = False
    ) -> List[CitationResult]:
        """
        Async version of self_citations.

        Returns one CitationResult per retained paper of the researcher.
        """
        if self.ssaid == "-1":
            return []
        if self.ssids == []:
            self.ssids = await get_papers_for_author(self.ssaid)

        results: List[CitationResult] = await self_citations_paper(
            self.ssids, verbose=verbose
        )
        # Remove papers with zero citations or that are erratum/corrigendum
        results = [
            r
            for r in results
            if r.num_citations > 0
            and "erratum" not in r.title.lower()
            and "corrigendum" not in r.title.lower()
        ]

        return results

    def self_citations(self, verbose: bool = False) -> ResearcherResult:
        """
        Sifts through all papers of a researcher and finds how often they are self-cited.

        Args:
            verbose: If True, logs detailed information for each paper.

        Returns:
            A ResearcherResult containing aggregated self-citation data.
        """
        citation_results = asyncio.run(self._self_citations_async(verbose=verbose))

        # Per paper title: the value recorded for this researcher, 0.0 if absent.
        individual_self_citations = {
            result.title: result.self_citations.get(self.name, 0.0)
            for result in citation_results
        }
        # max(1, ...) avoids a division by zero when no paper was retained.
        citation_ratio = sum(individual_self_citations.values()) / max(
            1, len(individual_self_citations)
        )

        self.result = self.result.model_copy(
            update={
                "num_citations": sum(r.num_citations for r in citation_results),
                "self_citations": dict(
                    sorted(
                        individual_self_citations.items(),
                        key=lambda x: x[1],
                        reverse=True,
                    )
                ),
                "self_citation_ratio": round(citation_ratio, 3),
            }
        )

        return self.result

    def get_result(self) -> ResearcherResult:
        """
        Provides the result of the analysis, running whichever part is missing.

        Returns:
            The ResearcherResult with both reference and citation data.
        """
        # The counters are initialised to -1 and overwritten with a sum >= 0
        # by the analyses, so -1 reliably means "not computed yet". The former
        # hasattr checks looked at attributes this class never sets and made
        # every call repeat both analyses.
        if self.result.num_references == -1:
            self.self_references()
        if self.result.num_citations == -1:
            self.self_citations()
        return self.result
__init__(input: str, mode: ModeType = 'infer')

Construct researcher object for self citation/reference analysis.

Parameters:

Name Type Description Default
input str

A researcher to search for, identified by name, ORCID iD, or Semantic Scholar Author ID.

required
mode ModeType

This can be a name orcid (ORCID iD) or ssaid (Semantic Scholar Author ID). Defaults to "infer".

'infer'

Raises:

Type Description
ValueError

Unknown mode

Source code in paperscraper/citations/entity/researcher.py
def __init__(self, input: str, mode: ModeType = "infer"):
    """
    Construct researcher object for self citation/reference analysis.

    Args:
        input: A researcher to search for, identified by name, ORCID iD, or Semantic Scholar Author ID.
        mode: This can be a `name` `orcid` (ORCID iD) or `ssaid` (Semantic Scholar Author ID).
            Defaults to "infer".

    Raises:
        ValueError: Unknown mode
    """
    if mode not in MODES:
        raise ValueError(f"Unknown mode {mode} chose from {MODES}.")

    input = input.strip()
    if mode == "infer":
        # Purely numeric input is taken as a Semantic Scholar author ID,
        # ORCID-shaped input as an ORCID iD, anything else as a name.
        if input.isdigit():
            mode = "ssaid"
        else:
            *leading, last = input.split("-")
            # The final character of an ORCID iD is a checksum that may be
            # "X" instead of a digit.
            if last.endswith("X"):
                last = last[:-1]
            if (
                input.count("-") == 3
                and len(input) == 19
                and last.isdigit()
                and all(part.isdigit() for part in leading)
            ):
                mode = "orcid"
            else:
                mode = "name"

    if mode == "ssaid":
        self.name = sch.get_author(input)._name
        self.ssaid = input
    elif mode == "orcid":
        orcid_name = orcid_to_author_name(input)
        self.orcid = input
        self.ssaid, self.name = author_name_to_ssaid(orcid_name)
    elif mode == "name":
        # The name reported by the lookup replaces the one typed by the user.
        self.ssaid, self.name = author_name_to_ssaid(input)

    self.result = ResearcherResult(
        name=self.name,
        ssaid=int(self.ssaid),
        orcid=self.orcid,
        # -1 marks an analysis that has not been run yet.
        num_citations=-1,
        num_references=-1,
    )
self_references(verbose: bool = False) -> ResearcherResult

Sifts through all papers of a researcher and extracts the self references.

Parameters:

Name Type Description Default
verbose bool

If True, logs detailed information for each paper.

False

Returns:

Type Description
ResearcherResult

A ResearcherResult containing aggregated self-reference data.

Source code in paperscraper/citations/entity/researcher.py
def self_references(self, verbose: bool = False) -> ResearcherResult:
    """
    Aggregate the self references over all papers of the researcher.

    Args:
        verbose: If True, logs detailed information for each paper.

    Returns:
        A ResearcherResult containing aggregated self-reference data.
    """
    per_paper = asyncio.run(self._self_references_async(verbose=verbose))

    # Value recorded for this researcher in each paper, keyed by paper title;
    # papers that do not list the researcher count as 0.0.
    own_share = {}
    total_references = 0
    for paper in per_paper:
        own_share[paper.title] = paper.self_references.get(self.name, 0.0)
        total_references += paper.num_references

    # Mean over the papers; the max() guards against an empty paper list.
    paper_count = max(1, len(own_share))
    reference_ratio = sum(own_share.values()) / paper_count

    ranked = sorted(own_share.items(), key=lambda entry: entry[1], reverse=True)
    update = {
        "num_references": total_references,
        "self_references": dict(ranked),
        "self_reference_ratio": round(reference_ratio, 3),
    }
    self.result = self.result.model_copy(update=update)

    return self.result
self_citations(verbose: bool = False) -> ResearcherResult

Sifts through all papers of a researcher and finds how often they are self-cited.

Source code in paperscraper/citations/entity/researcher.py
def self_citations(self, verbose: bool = False) -> ResearcherResult:
    """
    Aggregate the self citations over all papers of the researcher.

    Args:
        verbose: If True, logs detailed information for each paper.

    Returns:
        A ResearcherResult containing aggregated self-citation data.
    """
    per_paper = asyncio.run(self._self_citations_async(verbose=verbose))

    # Value recorded for this researcher in each paper, keyed by paper title;
    # papers that do not list the researcher count as 0.0.
    own_share = {}
    total_citations = 0
    for paper in per_paper:
        own_share[paper.title] = paper.self_citations.get(self.name, 0.0)
        total_citations += paper.num_citations

    # Mean over the papers; the max() guards against an empty paper list.
    paper_count = max(1, len(own_share))
    citation_ratio = sum(own_share.values()) / paper_count

    ranked = sorted(own_share.items(), key=lambda entry: entry[1], reverse=True)
    update = {
        "num_citations": total_citations,
        "self_citations": dict(ranked),
        "self_citation_ratio": round(citation_ratio, 3),
    }
    self.result = self.result.model_copy(update=update)

    return self.result
get_result() -> ResearcherResult

Provides the result of the analysis.

Source code in paperscraper/citations/entity/researcher.py
def get_result(self) -> ResearcherResult:
    """
    Provides the result of the analysis, running whichever part is missing.

    Returns:
        The ResearcherResult with both reference and citation data.
    """
    # The result starts with both counters at -1 and the analyses replace
    # them with a sum >= 0, so -1 means "not computed yet". Checking for
    # `self_ref` / `self_cite` attributes never matched on a researcher and
    # made every call repeat both analyses.
    if self.result.num_references == -1:
        self.self_references()
    if self.result.num_citations == -1:
        self.self_citations()
    return self.result

orcid

orcid_to_author_name(orcid_id: str) -> Optional[str]

Given an ORCID ID (as a string, e.g. '0000-0002-1825-0097'), returns the full name of the author from the ORCID public API.

Source code in paperscraper/citations/orcid.py
def orcid_to_author_name(orcid_id: str) -> Optional[str]:
    """
    Given an ORCID ID (as a string, e.g. '0000-0002-1825-0097'),
    returns the full name of the author from the ORCID public API.

    Args:
        orcid_id: The ORCID iD to look up.

    Returns:
        "<given names> <family name>" with missing parts left out (possibly an
        empty string), or None if the API did not answer with status 200.
    """

    headers = {"Accept": "application/json"}
    response = requests.get(f"{BASE_URL}{orcid_id}/person", headers=headers)
    if response.status_code != 200:
        logger.error(
            f"Error fetching ORCID data ({orcid_id}): {response.status_code} {response.text}"
        )
        return None

    # Any level of the payload may be JSON null (e.g. a record without a
    # family name), which `.get(key, {})` does not protect against because
    # the key is present. Fall back to an empty value at every level.
    name = response.json().get("name") or {}
    given = (name.get("given-names") or {}).get("value") or ""
    family = (name.get("family-name") or {}).get("value") or ""
    return f"{given} {family}".strip()

self_citations

self_citations_paper(inputs: Union[str, List[str]], verbose: bool = False) -> Union[CitationResult, List[CitationResult]] async

Analyze self-citations for one or more papers by DOI or Semantic Scholar ID.

Parameters:

Name Type Description Default
inputs Union[str, List[str]]

A single DOI/SSID string or a list of them.

required
verbose bool

If True, logs detailed information for each paper.

False

Returns:

Type Description
Union[CitationResult, List[CitationResult]]

A single CitationResult if a string was passed, else a list of CitationResults.

Source code in paperscraper/citations/self_citations.py
@optional_async
@retry_with_exponential_backoff(max_retries=10, base_delay=1.0)
async def self_citations_paper(
    inputs: Union[str, List[str]], verbose: bool = False
) -> Union[CitationResult, List[CitationResult]]:
    """
    Run the self-citation analysis for one paper or a batch of papers.

    All identifiers are processed concurrently over a single HTTP client, and
    the results are returned in the order of the inputs.

    Args:
        inputs: A single DOI/SSID string or a list of them.
        verbose: If True, logs detailed information for each paper.

    Returns:
        A single CitationResult if a string was passed, else a list of CitationResults.
    """
    if isinstance(inputs, str):
        single_input, identifiers = True, [inputs]
    else:
        single_input, identifiers = False, list(inputs)

    async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client:
        results = await asyncio.gather(
            *(_process_single(client, ident) for ident in identifiers)
        )

    if verbose:
        for paper in results:
            logger.info(
                f'Self-citations in "{paper.title}": N={paper.num_citations}, Score={paper.citation_score}%'
            )
            for author, share in paper.self_citations.items():
                logger.info(f"  {author}: {share}%")

    if single_input:
        return results[0]
    return results

self_references

self_references_paper(inputs: Union[str, List[str]], verbose: bool = False) -> Union[ReferenceResult, List[ReferenceResult]] async

Analyze self-references for one or more papers by DOI or Semantic Scholar ID.

Parameters:

Name Type Description Default
inputs Union[str, List[str]]

A single DOI/SSID string or a list of them.

required
verbose bool

If True, logs detailed information for each paper.

False

Returns:

Type Description
Union[ReferenceResult, List[ReferenceResult]]

A single ReferenceResult if a string was passed, else a list of ReferenceResults.

Raises:

Type Description
ValueError

If no references are found for a given identifier.

Source code in paperscraper/citations/self_references.py
@optional_async
async def self_references_paper(
    inputs: Union[str, List[str]], verbose: bool = False
) -> Union[ReferenceResult, List[ReferenceResult]]:
    """
    Analyze self-references for one or more papers by DOI or Semantic Scholar ID.

    All identifiers are processed concurrently over a single HTTP client. The
    returned list is in the order of `inputs`, regardless of the order in
    which the individual requests finish.

    Args:
        inputs: A single DOI/SSID string or a list of them.
        verbose: If True, shows a progress bar and logs detailed information
            for each paper.

    Returns:
        A single ReferenceResult if a string was passed, else a list of ReferenceResults.

    Raises:
        ValueError: If no references are found for a given identifier.
            NOTE(review): not raised in this function itself; presumably comes
            from `_process_single_reference` - confirm.
    """
    single_input = isinstance(inputs, str)
    identifiers = [inputs] if single_input else list(inputs)

    async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client:

        async def _tagged(position: int, ident: str):
            """Process one identifier and remember its position in the input."""
            return position, await _process_single_reference(client, ident)

        tasks = [_tagged(pos, ident) for pos, ident in enumerate(identifiers)]
        results: List[Optional[ReferenceResult]] = [None] * len(tasks)

        # as_completed yields in completion order (so the progress bar advances
        # as soon as any paper is done); the position tag puts every result
        # back into the slot of its input.
        iterator = asyncio.as_completed(tasks)
        if verbose:
            iterator = tqdm(
                iterator, total=len(tasks), desc="Collecting self-references"
            )

        for coro in iterator:
            position, res = await coro
            results[position] = res

    if verbose:
        for res in results:
            logger.info(
                f'Self-references in "{res.title}": N={res.num_references}, '
                f"Score={res.reference_score}%"
            )
            for author, pct in res.self_references.items():
                logger.info(f"  {author}: {pct}% self-references")

    return results[0] if single_input else results

tests

test_self_citations

TestSelfCitations
Source code in paperscraper/citations/tests/test_self_citations.py
class TestSelfCitations:
    """
    Tests for the self-citation analysis of papers and researchers.

    NOTE(review): the calls below are not stubbed, so these tests presumably
    need network access to the services used by the package - confirm.
    """

    @pytest.fixture
    def dois(self):
        """Two paper identifiers: the first a DOI, the second a 40-character hex ID."""
        return [
            "10.1038/s41586-023-06600-9",
            "ed69978f1594a4e2b9dccfc950490fa1df817ae8",
        ]

    def test_single_doi(self, dois):
        """A single identifier yields a single, well-formed CitationResult."""
        result = self_citations_paper(dois[0])
        assert isinstance(result, CitationResult)
        assert isinstance(result.ssid, str)
        assert isinstance(result.num_citations, int)
        assert result.num_citations > 10
        assert isinstance(result.citation_score, float)
        assert result.citation_score > 0
        for author, self_cites in result.self_citations.items():
            assert isinstance(author, str)
            assert isinstance(self_cites, float)
            assert 0 <= self_cites <= 100
        # NOTE(review): pause presumably spaces out API requests - confirm.
        time.sleep(5)

    def test_multiple_dois(self, dois):
        """A batch call returns one result per input and matches per-DOI calls."""
        start_time = time.perf_counter()
        result = self_citations_paper(dois)
        async_duration = time.perf_counter() - start_time
        assert isinstance(result, list)
        assert len(result) == len(dois)
        for cit_result in result:
            assert isinstance(cit_result, CitationResult)
            assert isinstance(cit_result.ssid, str)
            assert isinstance(cit_result.num_citations, int)
            assert cit_result.num_citations > 0
            assert cit_result.citation_score > 0
            assert isinstance(cit_result.citation_score, float)
            for author, self_cites in cit_result.self_citations.items():
                assert isinstance(author, str)
                assert isinstance(self_cites, float)
                assert 0 <= self_cites <= 100
        time.sleep(5)

        # compare async and sync performance

        # Measure synchronous execution time (one independent call per DOI)
        start_time = time.perf_counter()
        sync_result = [self_citations_paper(doi) for doi in dois]
        sync_duration = time.perf_counter() - start_time

        print(f"Asynchronous execution time (batch): {async_duration:.2f} seconds")
        print(
            f"Synchronous execution time (independent calls): {sync_duration:.2f} seconds"
        )

        # Deliberately lenient: the batch only fails this check when it is
        # more than ten times slower than the independent calls.
        assert 0.1 * async_duration <= sync_duration, (
            f"Async execution ({async_duration:.2f}s) is more than 10x slower than "
            f"sync execution ({sync_duration:.2f}s)"
        )

        for a, s in zip(result, sync_result):
            assert a == s, f"{a} vs {s}"

    def test_researcher(self):
        """
        Tests calculation of self-citations for all papers of an author.
        """
        ssaid = "2328976118"
        researcher = Researcher(ssaid)
        result = researcher.self_citations(verbose=True)
        assert result.ssaid == int(ssaid)
        assert isinstance(result.name, str)
        assert result.name == "Kacper Wyrwal"
        # Only the citation analysis ran, so the reference counter must still
        # hold its initial value.
        assert isinstance(result.num_references, int)
        assert result.num_references == -1
        assert isinstance(result.num_citations, int)
        assert result.num_citations > 0
        assert isinstance(result.self_citations, Dict)
        for title, ratio in result.self_citations.items():
            assert isinstance(title, str)
            assert isinstance(ratio, float)
            assert 0 <= ratio <= 100

        assert 0 <= result.self_citation_ratio <= 100
        print(result)

    def test_researcher_from_orcid(self):
        """
        Tests calculation of self-citations for an author given by ORCID iD.
        """
        orcid = "0000-0003-4221-6988"
        researcher = Researcher(orcid)
        result = researcher.self_citations(verbose=True)
        assert result.orcid == orcid
        assert isinstance(result.name, str)
        assert result.name == "Juan M. Galeazzi"
        # Only the citation analysis ran, so the reference counter must still
        # hold its initial value.
        assert isinstance(result.num_references, int)
        assert result.num_references == -1
        assert isinstance(result.num_citations, int)
        assert result.num_citations > 0
        # Check the mapping that this test actually produces and iterates.
        assert isinstance(result.self_citations, Dict)
        for title, ratio in result.self_citations.items():
            assert isinstance(title, str)
            assert isinstance(ratio, float)
            assert 0 <= ratio <= 100

        assert 0 <= result.self_citation_ratio <= 100
        print(result)

    def test_whole_researcher(self):
        """
        Tests the combined reference and citation analysis via get_result.
        """
        ssaid = "2104445902"
        researcher = Researcher(ssaid)
        result = researcher.get_result()
        assert result.ssaid == int(ssaid)
        assert isinstance(result.name, str)
        assert result.name == "Aleksandros Sobczyk"
        assert isinstance(result.num_references, int)
        assert result.num_references > 0
        assert isinstance(result.num_citations, int)
        assert result.num_citations > 0
        assert isinstance(result.self_citations, Dict)
        assert isinstance(result.self_references, Dict)
        assert len(result.self_citations) > 5
        assert len(result.self_references) >= 3
        for title, ratio in result.self_citations.items():
            assert isinstance(title, str)
            assert isinstance(ratio, float)
            assert 0 <= ratio <= 100
        for title, ratio in result.self_references.items():
            assert isinstance(title, str)
            assert isinstance(ratio, float)
            assert 0 <= ratio <= 100

        assert 0 <= result.self_citation_ratio <= 100
        assert 0 <= result.self_reference_ratio <= 100
        print(result)
test_researcher()

Tests calculation of self-citations for all papers of an author.

Source code in paperscraper/citations/tests/test_self_citations.py
def test_researcher(self):
    """
    Tests calculation of self-citations for all papers of an author.
    """
    ssaid = "2328976118"
    result = Researcher(ssaid).self_citations(verbose=True)

    # Identity of the researcher
    assert result.ssaid == int(ssaid)
    assert isinstance(result.name, str)
    assert result.name == "Kacper Wyrwal"

    # Only the citation analysis ran: the reference counter keeps its
    # initial value of -1.
    assert isinstance(result.num_references, int)
    assert result.num_references == -1
    assert isinstance(result.num_citations, int)
    assert result.num_citations > 0

    # Per-paper values
    assert isinstance(result.self_citations, Dict)
    for title, ratio in result.self_citations.items():
        assert isinstance(title, str)
        assert isinstance(ratio, float)
        assert 0 <= ratio <= 100

    assert 0 <= result.self_citation_ratio <= 100
    print(result)
test_researcher_from_orcid()

Tests calculation of self-citations for all papers of an author identified by ORCID iD.

Source code in paperscraper/citations/tests/test_self_citations.py
def test_researcher_from_orcid(self):
    """
    Tests calculation of self-citations for all papers of an author that is
    looked up via an ORCID.
    """
    orcid = "0000-0003-4221-6988"
    researcher = Researcher(orcid)
    result = researcher.self_citations(verbose=True)
    assert result.orcid == orcid
    assert isinstance(result.name, str)
    assert result.name == "Juan M. Galeazzi"
    # In this mode the reference count is expected to carry the -1 placeholder.
    assert isinstance(result.num_references, int)
    assert result.num_references == -1
    assert isinstance(result.num_citations, int)
    assert result.num_citations > 0
    # Type-check the mapping that is iterated below (the original checked
    # `self_references` here, leaving `self_citations` unverified).
    assert isinstance(result.self_citations, Dict)
    for title, ratio in result.self_citations.items():
        assert isinstance(title, str)
        assert isinstance(ratio, float)
        assert 0 <= ratio <= 100

    assert 0 <= result.self_citation_ratio <= 100
    print(result)

test_self_references

TestSelfReferences
Source code in paperscraper/citations/tests/test_self_references.py
class TestSelfReferences:
    """Tests for `self_references_paper` and `Researcher.self_references`."""

    @pytest.fixture
    def dois(self):
        """DOIs of the papers used as inputs by the tests below."""
        return [
            "10.1038/s41586-023-06600-9",
            "10.1016/j.neunet.2014.09.003",
        ]

    def test_single_doi(self, dois):
        """Passing one DOI yields a single, well-formed `ReferenceResult`."""
        result = self_references_paper(dois[0])
        assert isinstance(result, ReferenceResult)
        assert isinstance(result.num_references, int)
        assert result.num_references > 0
        assert isinstance(result.ssid, str)
        assert isinstance(result.reference_score, float)
        assert result.reference_score > 0
        assert isinstance(result.self_references, Dict)
        # Each author name maps to a percentage in [0, 100].
        for author_name, percentage in result.self_references.items():
            assert isinstance(author_name, str)
            assert isinstance(percentage, float)
            assert 0 <= percentage <= 100

    def test_multiple_dois(self, dois):
        """Passing a list of DOIs yields one `ReferenceResult` per DOI."""
        batch = dois[1:]
        results = self_references_paper(batch)
        assert isinstance(results, list)
        assert len(results) == len(batch)
        for entry in results:
            assert isinstance(entry, ReferenceResult)
            assert isinstance(entry.ssid, str)
            assert isinstance(entry.num_references, int)
            assert entry.num_references > 0
            assert entry.reference_score > 0
            assert isinstance(entry.reference_score, float)
            for author_name, percentage in entry.self_references.items():
                assert isinstance(author_name, str)
                assert isinstance(percentage, float)
                assert 0 <= percentage <= 100

    def test_compare_async_and_sync_performance(self, dois):
        """
        Times one batched call of `self_references_paper` against one call per DOI
        and checks that the batched call is not disproportionately slower.
        """

        # Batched call: all DOIs are handed over at once.
        tick = time.perf_counter()
        async_results = self_references_paper(dois)
        async_duration = time.perf_counter() - tick

        # Sequential baseline: one independent call per DOI.
        tick = time.perf_counter()
        sync_results = [self_references_paper(single) for single in dois]
        sync_duration = time.perf_counter() - tick

        print(f"Asynchronous execution time (batch): {async_duration:.2f} seconds")
        print(
            f"Synchronous execution time (independent calls): {sync_duration:.2f} seconds"
        )
        assert len(sync_results) == len(async_results)

        # The batched run may take at most twice as long as the sequential one.
        assert sync_duration >= 0.5 * async_duration, (
            f"Async execution ({async_duration:.2f}s) is slower than sync execution "
            f"({sync_duration:.2f}s)"
        )

    def test_researcher(self):
        """
        Tests calculation of self-references for all papers of an author that is
        looked up via a Semantic Scholar author ID.
        """
        author_id = "2326988211"
        result = Researcher(author_id).self_references(verbose=True)
        assert result.ssaid == int(author_id)
        assert isinstance(result.name, str)
        assert result.name == "Patrick Soga"
        assert isinstance(result.num_references, int)
        assert result.num_references > 0
        # In this mode the citation count is expected to carry the -1 placeholder.
        assert isinstance(result.num_citations, int)
        assert result.num_citations == -1
        assert isinstance(result.self_references, Dict)
        for paper_title, percentage in result.self_references.items():
            assert isinstance(paper_title, str)
            assert isinstance(percentage, float)
            assert 0 <= percentage <= 100

        assert 0 <= result.self_reference_ratio <= 100
        print(result)

    def test_researcher_from_orcid(self):
        """
        Tests calculation of self-references for all papers of an author that is
        looked up via an ORCID.
        """
        orcid = "0000-0003-4221-6988"
        result = Researcher(orcid).self_references(verbose=True)
        assert result.orcid == orcid
        assert isinstance(result.name, str)
        assert result.name == "Juan M. Galeazzi"
        assert isinstance(result.num_references, int)
        assert result.num_references > 0
        # In this mode the citation count is expected to carry the -1 placeholder.
        assert isinstance(result.num_citations, int)
        assert result.num_citations == -1
        assert isinstance(result.self_references, Dict)
        for paper_title, percentage in result.self_references.items():
            assert isinstance(paper_title, str)
            assert isinstance(percentage, float)
            assert 0 <= percentage <= 100

        assert 0 <= result.self_reference_ratio <= 100
        print(result)
test_compare_async_and_sync_performance(dois)

Compares the execution time of asynchronous and synchronous self_references for a list of DOIs.

Source code in paperscraper/citations/tests/test_self_references.py
def test_compare_async_and_sync_performance(self, dois):
    """
    Times one batched call of `self_references_paper` against one call per DOI
    and checks that the batched call is not disproportionately slower.
    """

    # Batched call: all DOIs are handed over at once.
    tick = time.perf_counter()
    async_results = self_references_paper(dois)
    async_duration = time.perf_counter() - tick

    # Sequential baseline: one independent call per DOI.
    tick = time.perf_counter()
    sync_results = [self_references_paper(single) for single in dois]
    sync_duration = time.perf_counter() - tick

    print(f"Asynchronous execution time (batch): {async_duration:.2f} seconds")
    print(
        f"Synchronous execution time (independent calls): {sync_duration:.2f} seconds"
    )
    assert len(sync_results) == len(async_results)

    # The batched run may take at most twice as long as the sequential one.
    assert sync_duration >= 0.5 * async_duration, (
        f"Async execution ({async_duration:.2f}s) is slower than sync execution "
        f"({sync_duration:.2f}s)"
    )
test_researcher()

Tests calculation of self-references for all papers of an author.

Source code in paperscraper/citations/tests/test_self_references.py
def test_researcher(self):
    """
    Tests calculation of self-references for all papers of an author that is
    looked up via a Semantic Scholar author ID.
    """
    author_id = "2326988211"
    result = Researcher(author_id).self_references(verbose=True)

    # The resolved author must match the requested ID and the expected name.
    assert result.ssaid == int(author_id)
    assert isinstance(result.name, str)
    assert result.name == "Patrick Soga"

    assert isinstance(result.num_references, int)
    assert result.num_references > 0
    # In this mode the citation count is expected to carry the -1 placeholder.
    assert isinstance(result.num_citations, int)
    assert result.num_citations == -1

    # Every entry maps a paper title to a percentage in [0, 100].
    assert isinstance(result.self_references, Dict)
    for paper_title, percentage in result.self_references.items():
        assert isinstance(paper_title, str)
        assert isinstance(percentage, float)
        assert 0 <= percentage <= 100

    assert 0 <= result.self_reference_ratio <= 100
    print(result)
test_researcher_from_orcid()

Tests calculation of self-references for all papers of an author.

Source code in paperscraper/citations/tests/test_self_references.py
def test_researcher_from_orcid(self):
    """
    Tests calculation of self-references for all papers of an author that is
    looked up via an ORCID.
    """
    orcid = "0000-0003-4221-6988"
    result = Researcher(orcid).self_references(verbose=True)

    # The result must echo the ORCID and carry the expected author name.
    assert result.orcid == orcid
    assert isinstance(result.name, str)
    assert result.name == "Juan M. Galeazzi"

    assert isinstance(result.num_references, int)
    assert result.num_references > 0
    # In this mode the citation count is expected to carry the -1 placeholder.
    assert isinstance(result.num_citations, int)
    assert result.num_citations == -1

    # Every entry maps a paper title to a percentage in [0, 100].
    assert isinstance(result.self_references, Dict)
    for paper_title, percentage in result.self_references.items():
        assert isinstance(paper_title, str)
        assert isinstance(percentage, float)
        assert 0 <= percentage <= 100

    assert 0 <= result.self_reference_ratio <= 100
    print(result)

utils

get_doi_from_title(title: str) -> Optional[str]

Searches the DOI of a paper based on the paper title

Parameters:

Name Type Description Default
title str

Paper title

required

Returns:

Type Description
Optional[str]

DOI according to semantic scholar API

Source code in paperscraper/citations/utils.py
def get_doi_from_title(title: str) -> Optional[str]:
    """
    Searches the DOI of a paper based on the paper title

    Args:
        title: Paper title

    Returns:
        DOI of the first search hit according to the semantic scholar API, or
        None (after logging a warning) if there is no hit or the hit has no DOI.
    """
    response = requests.get(
        PAPER_URL + "search",
        params={"query": title, "fields": "externalIds", "limit": 1},
        headers=HEADERS,
    )
    data = response.json()

    if data.get("data"):
        paper = data["data"][0]
        # `externalIds` can be present but null; `or {}` keeps the lookup from
        # raising AttributeError in that case.
        doi = (paper.get("externalIds") or {}).get("DOI")
        if doi:
            return doi
    logger.warning(f"Did not find DOI for title={title}")
    return None

get_doi_from_ssid(ssid: str, max_retries: int = 10) -> Optional[str] async

Given a Semantic Scholar paper ID, returns the corresponding DOI if available.

Parameters:

Name Type Description Default
ssid str

The paper ID on Semantic Scholar.

required

Returns:

Type Description
Optional[str]

str or None: The DOI of the paper, or None if not found or in case of an error.

Source code in paperscraper/citations/utils.py
@optional_async
async def get_doi_from_ssid(ssid: str, max_retries: int = 10) -> Optional[str]:
    """
    Given a Semantic Scholar paper ID, returns the corresponding DOI if available.

    The request is repeated up to `max_retries` times until a response with
    status code 200 is received.

    Parameters:
      ssid (str): The paper ID on Semantic Scholar.
      max_retries (int): Maximum number of requests sent before giving up.

    Returns:
      str or None: The DOI of the paper, or None if the paper has no DOI or no
        request succeeded.
    """
    async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client:
        logger.warning(
            "Semantic Scholar API is easily overloaded when passing SS IDs, provide DOIs to improve throughput."
        )
        # Stays None when `max_retries` < 1, i.e. when no request is ever sent.
        response = None
        for _ in tqdm(
            range(1, max_retries + 1), desc=f"Fetching DOI for {ssid}", unit="attempt"
        ):
            # Make the GET request to Semantic Scholar.
            response = await client.get(
                f"{PAPER_URL}{ssid}",
                params={"fields": "externalIds", "limit": 1},
                headers=HEADERS,
            )

            # If successful, try to extract and return the DOI.
            if response.status_code == 200:
                data = response.json()
                # `externalIds` can be present but null; `or {}` keeps the
                # lookup from raising AttributeError in that case.
                return (data.get("externalIds") or {}).get("DOI")

        if response is None:
            logger.warning(
                f"Did not find DOI for paper ID {ssid}: no request was sent (max_retries={max_retries})"
            )
        else:
            logger.warning(
                f"Did not find DOI for paper ID {ssid}. Code={response.status_code}, text={response.text}"
            )
        return None

get_title_and_id_from_doi(doi: str) -> Dict[str, str] | None async

Given a DOI, retrieves the paper's title and semantic scholar paper ID.

Parameters:

Name Type Description Default
doi str

The DOI of the paper (e.g., "10.18653/v1/N18-3011").

required

Returns:

Type Description
Dict[str, str] | None

dict or None: A dictionary with keys 'title' and 'ssid'.

Source code in paperscraper/citations/utils.py
@optional_async
async def get_title_and_id_from_doi(doi: str) -> Dict[str, str] | None:
    """
    Looks up a paper on Semantic Scholar by DOI and returns its title and paper ID.

    Parameters:
        doi (str): The DOI of the paper (e.g., "10.18653/v1/N18-3011").

    Returns:
        dict or None: A dictionary with keys 'title' and 'ssid', or None (after
            logging a warning) if the response status code is not 200.
    """
    async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client:
        response = await client.get(f"{PAPER_URL}DOI:{doi}", headers=HEADERS)

        # Anything but a 200 is reported and treated as "not found".
        if response.status_code != 200:
            logger.warning(
                f"Could not get authors & semantic scholar ID for DOI={doi}, {response.status_code}: {response.text}"
            )
            return None

        payload = response.json()
        return {"title": payload.get("title"), "ssid": payload.get("paperId")}

author_name_to_ssaid(author_name: str) -> Tuple[str, str] async

Given an author name, returns the Semantic Scholar author ID.

Parameters:

Name Type Description Default
author_name str

The full name of the author.

required

Returns:

Type Description
Tuple[str, str]

Tuple[str, str]: The SS author ID alongside the SS name (may differ slightly from input name), or ('-1', 'N.A.') if no author is found.

Source code in paperscraper/citations/utils.py
@optional_async
@retry_with_exponential_backoff(max_retries=10, base_delay=1.0)
async def author_name_to_ssaid(author_name: str) -> Tuple[str, str]:
    """
    Resolves an author name to a Semantic Scholar author ID.

    Parameters:
        author_name (str): The full name of the author.

    Returns:
        Tuple[str, str]: The SS author ID alongside the SS name (may differ
            slightly from input name) of the first search hit, or
            ('-1', 'N.A.') if the search returned no author.

    Raises:
        Whatever `response.raise_for_status()` raises for an error response.
    """
    async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client:
        response = await client.get(
            AUTHOR_URL,
            params={"query": author_name, "fields": "name", "limit": 1},
            headers=HEADERS,
        )
        response.raise_for_status()
        hits = response.json().get("data", [])

        # No hit: report it and hand back the placeholder pair.
        if not hits:
            logger.error(
                f"Error in retrieving name from SS Author ID: {response.status_code} - {response.text}"
            )
            return ("-1", "N.A.")

        # Only the first (best-ranked) hit is used.
        best = hits[0]
        return best["authorId"], best["name"]

determine_paper_input_type(input: str) -> Literal['ssid', 'doi', 'title']

Determines the intended input type by the user if not explicitly given (infer).

Parameters:

Name Type Description Default
input str

Either a DOI, a semantic scholar paper ID or a paper title.

required

Returns:

Type Description
Literal['ssid', 'doi', 'title']

The input type

Source code in paperscraper/citations/utils.py
def determine_paper_input_type(input: str) -> Literal["ssid", "doi", "title"]:
    """
    Infers which kind of paper identifier the user passed when the type was not
    given explicitly (`infer`).

    Args:
        input: Either a DOI, a semantic scholar paper ID or a paper title.

    Returns:
        "ssid" for a string of more than 15 lowercase alphanumeric characters,
        "doi" if `DOI_PATTERN` matches exactly once, otherwise "title".
    """
    # Paper IDs are checked first: long, purely alphanumeric, lowercase, no blanks.
    looks_like_ssid = (
        len(input) > 15 and " " not in input and input.isalnum() and input.islower()
    )
    if looks_like_ssid:
        return "ssid"

    doi_hits = re.findall(DOI_PATTERN, input, re.IGNORECASE)
    if len(doi_hits) == 1:
        return "doi"

    # Fallback: anything else is treated as a title.
    logger.info(
        f"Assuming `{input}` is a paper title, since it seems neither a DOI nor a paper ID"
    )
    return "title"

get_papers_for_author(ss_author_id: str) -> List[str] async

Given a Semantic Scholar author ID, returns a list of all Semantic Scholar paper IDs for that author.

Parameters:

Name Type Description Default
ss_author_id str

The Semantic Scholar author ID (e.g., "1741101").

required

Returns:

Type Description
List[str]

A list of paper IDs (as strings) authored by the given author.

Source code in paperscraper/citations/utils.py
@retry_with_exponential_backoff(max_retries=10, base_delay=1.0)
async def get_papers_for_author(ss_author_id: str) -> List[str]:
    """
    Collects the Semantic Scholar paper IDs of all papers listed for an author.

    The author's paper list is requested page by page (100 entries per request)
    until a page comes back with fewer entries than requested.

    Args:
        ss_author_id (str): The Semantic Scholar author ID (e.g., "1741101").

    Returns:
        A list of paper IDs (as strings) authored by the given author.
    """
    page_size = 100
    collected = []
    position = 0

    async with httpx.AsyncClient() as client:
        while True:
            response = await client.get(
                f"https://api.semanticscholar.org/graph/v1/author/{ss_author_id}/papers",
                params={"fields": "paperId", "offset": position, "limit": page_size},
            )
            response.raise_for_status()
            entries = response.json().get("data", [])

            # Keep the ID of every entry on this page that carries one.
            collected.extend(
                entry["paperId"] for entry in entries if "paperId" in entry
            )

            # A short page means there is nothing left to fetch.
            if len(entries) < page_size:
                break

            position += page_size

    return collected

find_matching(first: List[Dict[str, str]], second: List[Dict[str, str]]) -> List[str]

Ingests two sets of authors and returns a list of those that match (either based on name or on author ID).

Parameters:

Name Type Description Default
first List[Dict[str, str]]

First set of authors given as list of dict with two keys (authorID and name).

required
second List[Dict[str, str]]

Second set of authors given as list of dict with two same keys.

required

Returns:

Type Description
List[str]

List of names of authors in first list where a match was found.

Source code in paperscraper/citations/utils.py
def find_matching(
    first: List[Dict[str, str]], second: List[Dict[str, str]]
) -> List[str]:
    """
    Ingests two sets of authors and returns a list of those that match (either based on name
        or on author ID).

    Args:
        first: First set of authors given as list of dict with two keys (`authorId` and `name`).
        second: Second set of authors given as list of dict with two same keys.

    Returns:
        List of names of authors in first list where a match was found. The names
        are de-duplicated and their order is not defined.
    """
    # Author IDs of the second set. A missing ID (None) is left out so that two
    # authors without an ID are not treated as the same person.
    second_ids = {s["authorId"] for s in second if s["authorId"] is not None}
    matched_by_id = {f["name"] for f in first if f["authorId"] in second_ids}

    # Name-based fallback for everyone not matched by ID. The set holds names,
    # so the membership test must use the name; the original compared the
    # author ID against it, which never filtered anything.
    matched_by_name = {
        f["name"]
        for f in first
        if f["name"] not in matched_by_id
        and any(check_overlap(f["name"], s["name"]) for s in second)
    }
    return list(matched_by_id | matched_by_name)

check_overlap(n1: str, n2: str) -> bool

Check whether two author names are identical.

Heuristics
  • Case insensitive
  • If name sets are identical, a match is assumed (e.g. "John Walter" vs "Walter John").
  • Assume the last token is the surname and require:
    • same surname
    • both have at least one given name
    • first given names are compatible (same, or initial vs full)

Parameters:

Name Type Description Default
n1 str

first name (e.g., "John A. Smith")

required
n2 str

second name (e.g., "J. Smith")

required

Returns:

Name Type Description
bool bool

Whether names are identical.

Source code in paperscraper/citations/utils.py
def check_overlap(n1: str, n2: str) -> bool:
    """
    Decide whether two author names refer to the same person.

    Both names are normalized with `clean_name` and split into tokens. Then:
        - Two names with the same set of tokens match, regardless of order
          (e.g. "John Walter" vs "Walter John").
        - Otherwise the last token is taken as the surname and a match requires:
            * identical surnames
            * at least one given name on both sides
            * compatible first given names (equal, or a single-letter initial
              that the other name starts with); further given names are ignored

    Args:
        n1: first name (e.g., "John A. Smith")
        n2: second name (e.g., "J. Smith")

    Returns:
        bool: Whether names are identical.
    """
    tokens_a = clean_name(n1).split()
    tokens_b = clean_name(n2).split()

    # Nothing left of a name after cleaning: cannot match.
    if not (tokens_a and tokens_b):
        return False

    # Same tokens in any order.
    if set(tokens_a) == set(tokens_b):
        return True

    # Last token is the surname, everything before it a given name.
    *given_a, surname_a = tokens_a
    *given_b, surname_b = tokens_b

    if surname_a != surname_b:
        return False
    if not given_a or not given_b:
        return False

    # Only the first given name decides; it may be abbreviated to an initial.
    lead_a, lead_b = given_a[0], given_b[0]
    if lead_a == lead_b:
        return True
    if len(lead_a) == 1 and lead_b.startswith(lead_a):
        return True
    return len(lead_b) == 1 and lead_a.startswith(lead_b)

clean_name(s: str) -> str

Clean up a str by removing special characters.

Parameters:

Name Type Description Default
s str

Input possibly containing special symbols

required

Returns:

Type Description
str

Homogenized string.

Source code in paperscraper/citations/utils.py
def clean_name(s: str) -> str:
    """
    Homogenize a string for name comparison.

    The input is passed through `unidecode`, every character that is neither a
    letter nor whitespace is dropped, and the remainder is lowercased.

    Args:
        s: Input possibly containing special symbols

    Returns:
        Homogenized string.
    """
    transliterated = unidecode(s)
    kept = [ch for ch in transliterated if ch.isalpha() or ch.isspace()]
    return "".join(kept).lower()