paperscraper.xrxiv

bioRxiv and medRxiv utilities.

xrxiv_api

API for bioRxiv and medRxiv.

XRXivApi

API class.

Source code in paperscraper/xrxiv/xrxiv_api.py
class XRXivApi:
    """API class."""

    def __init__(
        self,
        server: str,
        launch_date: str,
        api_base_url: str = "https://api.biorxiv.org",
        max_retries: int = 10,
        request_timeout: Tuple[float, float] = (5.0, 30.0),
        retry_backoff_seconds: float = 1.0,
        window_days: int = 365,
    ):
        """
        Initialize API class.

        Args:
            server: Name of the preprint server to access.
            launch_date: Launch date expressed as YYYY-MM-DD.
            api_base_url: Base url for the API.
            max_retries: Number of retries for transient failures per request.
            request_timeout: (connect timeout, read timeout), in seconds.
            retry_backoff_seconds: Initial backoff delay between retries.
            window_days: Date-window size for full-history scraping. Windows keep
                cursor offsets small and reduce deep-pagination slowdowns.
        """
        self.server = server
        self.api_base_url = api_base_url
        self.launch_date = launch_date
        self.launch_datetime = datetime.fromisoformat(self.launch_date)
        self.get_papers_url = (
            "{}/details/{}".format(self.api_base_url, self.server)
            + "/{start_date}/{end_date}/{cursor}"
        )
        self.max_retries = max(1, int(max_retries))
        self.request_timeout = request_timeout
        self.retry_backoff_seconds = max(0.0, retry_backoff_seconds)
        self.window_days = max(1, int(window_days))

        # Reuse TCP connections across requests to lower latency and avoid
        # repeated TLS setup in long-running dumps.
        self.session = requests.Session()
        adapter = HTTPAdapter(pool_connections=8, pool_maxsize=8)
        self.session.mount("https://", adapter)
        self.session.mount("http://", adapter)

    def _iter_date_windows(self, start_datetime: datetime, end_datetime: datetime):
        """Iterate over date windows using the instance window size.

        Args:
            start_datetime: Start of the overall date range.
            end_datetime: End of the overall date range.

        Yields:
            Tuple[datetime, datetime]: Inclusive window bounds.
        """
        current_start = start_datetime
        max_span = timedelta(days=self.window_days - 1)

        while current_start <= end_datetime:
            current_end = min(current_start + max_span, end_datetime)
            yield current_start, current_end
            current_start = current_end + timedelta(days=1)

    def _normalize_date_range(
        self, start_date: Optional[str], end_date: Optional[str]
    ) -> Tuple[datetime, datetime]:
        """Validate and normalize start/end date inputs.

        Args:
            start_date: Optional start date in YYYY-MM-DD format.
            end_date: Optional end date in YYYY-MM-DD format.

        Returns:
            Tuple[datetime, datetime]: Normalized start and end datetimes.

        Raises:
            ValueError: If the normalized start date is after the end date.
        """
        now_datetime = datetime.now()
        if start_date:
            start_datetime = datetime.fromisoformat(start_date)
            if start_datetime < self.launch_datetime:
                start_datetime = self.launch_datetime
        else:
            start_datetime = self.launch_datetime

        if end_date:
            end_datetime = datetime.fromisoformat(end_date)
            if end_datetime > now_datetime:
                end_datetime = now_datetime
        else:
            end_datetime = now_datetime

        if start_datetime > end_datetime:
            raise ValueError(
                f"start_date {start_datetime.strftime('%Y-%m-%d')} cannot be later than "
                f"end_date {end_datetime.strftime('%Y-%m-%d')}"
            )
        return start_datetime, end_datetime

    def _iter_date_windows_custom(
        self,
        start_datetime: datetime,
        end_datetime: datetime,
        window_days: int,
    ):
        """Iterate over date windows for an explicit window size.

        Args:
            start_datetime: Start of the overall date range.
            end_datetime: End of the overall date range.
            window_days: Number of days per yielded window.

        Yields:
            Tuple[datetime, datetime]: Inclusive window bounds.

        Raises:
            ValueError: If `window_days` is smaller than 1.
        """
        if window_days < 1:
            raise ValueError(f"window_days must be >= 1, got {window_days}")
        current_start = start_datetime
        max_span = timedelta(days=window_days - 1)

        while current_start <= end_datetime:
            current_end = min(current_start + max_span, end_datetime)
            yield current_start, current_end
            current_start = current_end + timedelta(days=1)

    def _worker_api(self, worker_window_days: int, worker_retries: int) -> "XRXivApi":
        """Create a dedicated API client for a parallel worker.

        Args:
            worker_window_days: Window size used by the worker client.
            worker_retries: Retry limit used by the worker client.

        Returns:
            XRXivApi: A new client configured like the parent instance.
        """
        # Use a dedicated client per worker to avoid shared-session contention
        # across threads and keep retry/backoff state isolated.
        return XRXivApi(
            server=self.server,
            launch_date=self.launch_date,
            api_base_url=self.api_base_url,
            max_retries=worker_retries,
            request_timeout=self.request_timeout,
            retry_backoff_seconds=self.retry_backoff_seconds,
            window_days=worker_window_days,
        )

    def _fetch_window_to_file(
        self,
        idx: int,
        start_date: str,
        end_date: str,
        output_dir: str,
        fields: List[str],
        max_retries: int,
    ) -> Tuple[int, str, int]:
        """Fetch one date window and persist it as a temporary JSONL chunk.

        Args:
            idx: Window index used for stable chunk ordering.
            start_date: Window start date in YYYY-MM-DD format.
            end_date: Window end date in YYYY-MM-DD format.
            output_dir: Directory where the temporary chunk is written.
            fields: Metadata fields to keep for each record.
            max_retries: Per-request retry limit for this fetch.

        Returns:
            Tuple[int, str, int]: Window index, chunk path, and written row count.
        """
        worker_window_days = (
            datetime.fromisoformat(end_date) - datetime.fromisoformat(start_date)
        ).days + 1
        api = self._worker_api(
            worker_window_days=worker_window_days,
            worker_retries=max_retries,
        )

        part_path = os.path.join(output_dir, f"window_{idx:04d}.jsonl")
        count = 0
        with open(part_path, "w", encoding="utf-8") as fp:
            for paper in api.get_papers(
                start_date=start_date,
                end_date=end_date,
                fields=fields,
                max_retries=max_retries,
            ):
                if count > 0:
                    fp.write(os.linesep)
                fp.write(json.dumps(paper))
                count += 1
        return idx, part_path, count

    @staticmethod
    def _merge_window_files(
        save_path: str,
        ordered_window_paths: List[str],
        deduplicate_dois: bool,
    ) -> int:
        """Merge temporary JSONL chunks into a single output file.

        Args:
            save_path: Final JSONL output path.
            ordered_window_paths: Chunk files sorted by window order.
            deduplicate_dois: Whether to drop duplicate DOI entries.

        Returns:
            int: Number of rows written to `save_path`.
        """
        seen_dois = set()
        written = 0

        with open(save_path, "w", encoding="utf-8") as out_fp:
            for window_path in ordered_window_paths:
                with open(window_path, "r", encoding="utf-8") as in_fp:
                    for line in in_fp:
                        line = line.strip()
                        if not line:
                            continue

                        if deduplicate_dois:
                            paper = json.loads(line)
                            doi = str(paper.get("doi", "")).strip().lower()
                            if doi and doi in seen_dois:
                                continue
                            if doi:
                                seen_dois.add(doi)

                        if written > 0:
                            out_fp.write(os.linesep)
                        out_fp.write(line)
                        written += 1

        return written

    def dump_papers(
        self,
        save_path: str,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        fields: Optional[List[str]] = None,
        max_retries: Optional[int] = None,
        max_workers: int = 1,
        window_days: Optional[int] = None,
        deduplicate_dois: bool = False,
        show_progress: bool = True,
    ) -> int:
        """Dump papers to JSONL, optionally in parallel using date windows.

        Args:
            save_path: Output JSONL path.
            start_date: Begin date, YYYY-MM-DD.
            end_date: End date, YYYY-MM-DD.
            fields: Fields to include per paper.
            max_retries: Optional per-request retry override.
            max_workers: Number of parallel window workers.
            window_days: Window size used for partitioning date ranges.
            deduplicate_dois: If True, drop repeated DOIs while merging windows.
            show_progress: If True, render tqdm progress bars.

        Returns:
            Number of records written to `save_path`.
        """
        if fields is None:
            fields = ["title", "doi", "authors", "abstract", "date", "journal"]
        if max_workers < 1:
            raise ValueError(f"max_workers must be >= 1, got {max_workers}")
        worker_retries = max_retries if max_retries is not None else self.max_retries
        worker_retries = max(1, int(worker_retries))
        span_days = max(1, int(window_days if window_days is not None else self.window_days))

        start_datetime, end_datetime = self._normalize_date_range(start_date, end_date)
        start_text = start_datetime.strftime("%Y-%m-%d")
        end_text = end_datetime.strftime("%Y-%m-%d")

        output_dir = os.path.dirname(os.path.abspath(save_path)) or "."
        os.makedirs(output_dir, exist_ok=True)

        if max_workers == 1:
            iterator = self.get_papers(
                start_date=start_text,
                end_date=end_text,
                fields=fields,
                max_retries=worker_retries,
            )
            if show_progress:
                iterator = tqdm(iterator, desc=f"{self.server} dump")

            written = 0
            seen_dois = set()
            with open(save_path, "w", encoding="utf-8") as fp:
                for paper in iterator:
                    if deduplicate_dois:
                        # For sequential mode, track DOIs inline.
                        doi = str(paper.get("doi", "")).strip().lower()
                        if doi and doi in seen_dois:
                            continue
                        if doi:
                            seen_dois.add(doi)

                    if written > 0:
                        fp.write(os.linesep)
                    fp.write(json.dumps(paper))
                    written += 1

            return written

        windows = list(
            self._iter_date_windows_custom(
                start_datetime=start_datetime,
                end_datetime=end_datetime,
                window_days=span_days,
            )
        )
        if not windows:
            with open(save_path, "w", encoding="utf-8"):
                pass
            return 0

        # Keep worker chunks outside server_dumps so interrupted runs
        # cannot be mistaken for dump files during module import.
        tmp_dir = mkdtemp(prefix=f"{self.server}_windows_")
        try:
            part_files: Dict[int, str] = {}
            futures = []
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                for idx, (win_start, win_end) in enumerate(windows):
                    fut = executor.submit(
                        self._fetch_window_to_file,
                        idx,
                        win_start.strftime("%Y-%m-%d"),
                        win_end.strftime("%Y-%m-%d"),
                        tmp_dir,
                        fields,
                        worker_retries,
                    )
                    futures.append(fut)

                completion_iter = as_completed(futures)
                if show_progress:
                    completion_iter = tqdm(
                        completion_iter,
                        total=len(futures),
                        desc=f"{self.server} windows",
                    )

                for fut in completion_iter:
                    idx, part_path, _ = fut.result()
                    part_files[idx] = part_path

            ordered_window_paths = [part_files[idx] for idx in sorted(part_files)]
            return self._merge_window_files(
                save_path=save_path,
                ordered_window_paths=ordered_window_paths,
                deduplicate_dois=deduplicate_dois,
            )
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)

    def call_api(
        self,
        start_date: str,
        end_date: str,
        cursor: int,
        max_retries: Optional[int] = None,
    ) -> dict:
        """Call the x-rxiv details endpoint with retry and backoff.

        Args:
            start_date: Query start date in YYYY-MM-DD format.
            end_date: Query end date in YYYY-MM-DD format.
            cursor: Cursor offset for paginated retrieval.
            max_retries: Optional retry override for this call.

        Returns:
            dict: Parsed JSON response payload from the endpoint.

        Raises:
            requests.HTTPError: If a non-retryable HTTP error occurs.
            requests.Timeout: If retries are exhausted after timeout failures.
            requests.ConnectionError: If retries are exhausted after connection failures.
            RuntimeError: If the request loop exits unexpectedly without a result.
        """
        max_attempts = max_retries if max_retries is not None else self.max_retries
        max_attempts = max(1, int(max_attempts))
        backoff = self.retry_backoff_seconds
        transient_status = {408, 429, 500, 502, 503, 504}
        url = self.get_papers_url.format(
            start_date=start_date,
            end_date=end_date,
            cursor=cursor,
        )
        last_error = None

        for attempt in range(1, max_attempts + 1):
            try:
                response = self.session.get(url, timeout=self.request_timeout)
                if response.status_code in transient_status:
                    logger.info(
                        f"{self.server} transient HTTP {response.status_code} at cursor {cursor} "
                        f"for {start_date}..{end_date} (attempt {attempt}/{max_attempts}); retrying"
                    )
                    if attempt == max_attempts:
                        response.raise_for_status()
                    if backoff:
                        sleep(backoff)
                        backoff = min(60.0, max(backoff * 2, backoff + 0.1))
                    continue

                response.raise_for_status()
                return response.json()
            except (Timeout, ConnectionError) as exc:
                last_error = exc
                logger.info(
                    f"{self.server} request failed ({exc.__class__.__name__}) at cursor {cursor} "
                    f"for {start_date}..{end_date} (attempt {attempt}/{max_attempts}); retrying"
                )
                if attempt == max_attempts:
                    raise
                if backoff:
                    sleep(backoff)
                    backoff = min(60.0, max(backoff * 2, backoff + 0.1))
            except (JSONDecodeError, ValueError) as exc:
                last_error = exc
                logger.info(
                    f"{self.server} JSON decode failed at cursor {cursor} for {start_date}..{end_date} "
                    f"(attempt {attempt}/{max_attempts}); retrying"
                )
                if attempt == max_attempts:
                    raise
                if backoff:
                    sleep(backoff)
                    backoff = min(60.0, max(backoff * 2, backoff + 0.1))
            except HTTPError as exc:
                last_error = exc
                # Non-transient errors should fail fast.
                raise

        if last_error is not None:
            raise last_error
        raise RuntimeError("Failed to query x-rxiv API for unknown reasons")

    def get_papers(
        self,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        fields: Optional[List[str]] = None,
        max_retries: Optional[int] = None,
    ) -> Generator:
        """
        Get paper metadata.

        Args:
            start_date: Begin date. Defaults to launch date.
            end_date: End date. Defaults to today.
            fields: Fields to return per paper.
            max_retries: Optional per-request retry override.

        Yields:
            A generator of paper metadata dicts.
        """
        if fields is None:
            fields = ["title", "doi", "authors", "abstract", "date", "journal"]

        start_datetime, end_datetime = self._normalize_date_range(start_date, end_date)

        for window_start, window_end in self._iter_date_windows(
            start_datetime, end_datetime
        ):
            start_date_text = window_start.strftime("%Y-%m-%d")
            end_date_text = window_end.strftime("%Y-%m-%d")
            cursor = 0

            while True:
                json_response = self.call_api(
                    start_date=start_date_text,
                    end_date=end_date_text,
                    cursor=cursor,
                    max_retries=max_retries,
                )

                messages = json_response.get("messages", [])
                message = messages[0] if messages else {}
                if message.get("status") != "ok":
                    break

                collection = json_response.get("collection", [])
                if not collection:
                    break

                for paper in collection:
                    yield {field: paper.get(field, "") for field in fields}

                returned_count = len(collection)
                cursor += returned_count

                # API pages are capped at 100 items. If we received fewer than that,
                # we have reached the end of this window and no further request is needed.
                if returned_count < 100:
                    break

                total = _to_int(message.get("total"), default=0)
                if total and cursor >= total:
                    break
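
The class splits any requested date range into inclusive windows before paginating, which keeps the API cursor small. The sketch below reproduces that window arithmetic on its own, purely as an illustration of how _iter_date_windows and _iter_date_windows_custom partition a range; the dates and the 30-day window size are arbitrary examples.

from datetime import datetime, timedelta

def split_into_windows(start, end, window_days):
    # Same inclusive-bound arithmetic as the private helpers above.
    span = timedelta(days=window_days - 1)
    current = start
    while current <= end:
        window_end = min(current + span, end)
        yield current, window_end
        current = window_end + timedelta(days=1)

for lo, hi in split_into_windows(datetime(2024, 1, 1), datetime(2024, 3, 10), 30):
    print(lo.date(), "->", hi.date())
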
__init__(server: str, launch_date: str, api_base_url: str = 'https://api.biorxiv.org', max_retries: int = 10, request_timeout: Tuple[float, float] = (5.0, 30.0), retry_backoff_seconds: float = 1.0, window_days: int = 365)

Initialize API class.

Parameters:

Name Type Description Default
server str

Name of the preprint server to access.

required
launch_date str

Launch date expressed as YYYY-MM-DD.

required
api_base_url str

Base url for the API.

'https://api.biorxiv.org'
max_retries int

Number of retries for transient failures per request.

10
request_timeout Tuple[float, float]

(connect timeout, read timeout), in seconds.

(5.0, 30.0)
retry_backoff_seconds float

Initial backoff delay between retries.

1.0
window_days int

Date-window size for full-history scraping. Windows keep cursor offsets small and reduce deep-pagination slowdowns.

365
Source code in paperscraper/xrxiv/xrxiv_api.py
def __init__(
    self,
    server: str,
    launch_date: str,
    api_base_url: str = "https://api.biorxiv.org",
    max_retries: int = 10,
    request_timeout: Tuple[float, float] = (5.0, 30.0),
    retry_backoff_seconds: float = 1.0,
    window_days: int = 365,
):
    """
    Initialize API class.

    Args:
        server: Name of the preprint server to access.
        launch_date: Launch date expressed as YYYY-MM-DD.
        api_base_url: Base url for the API.
        max_retries: Number of retries for transient failures per request.
        request_timeout: (connect timeout, read timeout), in seconds.
        retry_backoff_seconds: Initial backoff delay between retries.
        window_days: Date-window size for full-history scraping. Windows keep
            cursor offsets small and reduce deep-pagination slowdowns.
    """
    self.server = server
    self.api_base_url = api_base_url
    self.launch_date = launch_date
    self.launch_datetime = datetime.fromisoformat(self.launch_date)
    self.get_papers_url = (
        "{}/details/{}".format(self.api_base_url, self.server)
        + "/{start_date}/{end_date}/{cursor}"
    )
    self.max_retries = max(1, int(max_retries))
    self.request_timeout = request_timeout
    self.retry_backoff_seconds = max(0.0, retry_backoff_seconds)
    self.window_days = max(1, int(window_days))

    # Reuse TCP connections across requests to lower latency and avoid
    # repeated TLS setup in long-running dumps.
    self.session = requests.Session()
    adapter = HTTPAdapter(pool_connections=8, pool_maxsize=8)
    self.session.mount("https://", adapter)
    self.session.mount("http://", adapter)
dump_papers(save_path: str, start_date: Optional[str] = None, end_date: Optional[str] = None, fields: Optional[List[str]] = None, max_retries: Optional[int] = None, max_workers: int = 1, window_days: Optional[int] = None, deduplicate_dois: bool = False, show_progress: bool = True) -> int

Dump papers to JSONL, optionally in parallel using date windows.

Parameters:

Name Type Description Default
save_path str

Output JSONL path.

required
start_date Optional[str]

Begin date, YYYY-MM-DD.

None
end_date Optional[str]

End date, YYYY-MM-DD.

None
fields Optional[List[str]]

Fields to include per paper.

None
max_retries Optional[int]

Optional per-request retry override.

None
max_workers int

Number of parallel window workers.

1
window_days Optional[int]

Window size used for partitioning date ranges.

None
deduplicate_dois bool

If True, drop repeated DOIs while merging windows.

False
show_progress bool

If True, render tqdm progress bars.

True

Returns:

Type Description
int

Number of records written to save_path.

Source code in paperscraper/xrxiv/xrxiv_api.py
def dump_papers(
    self,
    save_path: str,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    fields: Optional[List[str]] = None,
    max_retries: Optional[int] = None,
    max_workers: int = 1,
    window_days: Optional[int] = None,
    deduplicate_dois: bool = False,
    show_progress: bool = True,
) -> int:
    """Dump papers to JSONL, optionally in parallel using date windows.

    Args:
        save_path: Output JSONL path.
        start_date: Begin date, YYYY-MM-DD.
        end_date: End date, YYYY-MM-DD.
        fields: Fields to include per paper.
        max_retries: Optional per-request retry override.
        max_workers: Number of parallel window workers.
        window_days: Window size used for partitioning date ranges.
        deduplicate_dois: If True, drop repeated DOIs while merging windows.
        show_progress: If True, render tqdm progress bars.

    Returns:
        Number of records written to `save_path`.
    """
    if fields is None:
        fields = ["title", "doi", "authors", "abstract", "date", "journal"]
    if max_workers < 1:
        raise ValueError(f"max_workers must be >= 1, got {max_workers}")
    worker_retries = max_retries if max_retries is not None else self.max_retries
    worker_retries = max(1, int(worker_retries))
    span_days = max(1, int(window_days if window_days is not None else self.window_days))

    start_datetime, end_datetime = self._normalize_date_range(start_date, end_date)
    start_text = start_datetime.strftime("%Y-%m-%d")
    end_text = end_datetime.strftime("%Y-%m-%d")

    output_dir = os.path.dirname(os.path.abspath(save_path)) or "."
    os.makedirs(output_dir, exist_ok=True)

    if max_workers == 1:
        iterator = self.get_papers(
            start_date=start_text,
            end_date=end_text,
            fields=fields,
            max_retries=worker_retries,
        )
        if show_progress:
            iterator = tqdm(iterator, desc=f"{self.server} dump")

        written = 0
        seen_dois = set()
        with open(save_path, "w", encoding="utf-8") as fp:
            for paper in iterator:
                if deduplicate_dois:
                    # For sequential mode, track DOIs inline.
                    doi = str(paper.get("doi", "")).strip().lower()
                    if doi and doi in seen_dois:
                        continue
                    if doi:
                        seen_dois.add(doi)

                if written > 0:
                    fp.write(os.linesep)
                fp.write(json.dumps(paper))
                written += 1

        return written

    windows = list(
        self._iter_date_windows_custom(
            start_datetime=start_datetime,
            end_datetime=end_datetime,
            window_days=span_days,
        )
    )
    if not windows:
        with open(save_path, "w", encoding="utf-8"):
            pass
        return 0

    # Keep worker chunks outside server_dumps so interrupted runs
    # cannot be mistaken for dump files during module import.
    tmp_dir = mkdtemp(prefix=f"{self.server}_windows_")
    try:
        part_files: Dict[int, str] = {}
        futures = []
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            for idx, (win_start, win_end) in enumerate(windows):
                fut = executor.submit(
                    self._fetch_window_to_file,
                    idx,
                    win_start.strftime("%Y-%m-%d"),
                    win_end.strftime("%Y-%m-%d"),
                    tmp_dir,
                    fields,
                    worker_retries,
                )
                futures.append(fut)

            completion_iter = as_completed(futures)
            if show_progress:
                completion_iter = tqdm(
                    completion_iter,
                    total=len(futures),
                    desc=f"{self.server} windows",
                )

            for fut in completion_iter:
                idx, part_path, _ = fut.result()
                part_files[idx] = part_path

        ordered_window_paths = [part_files[idx] for idx in sorted(part_files)]
        return self._merge_window_files(
            save_path=save_path,
            ordered_window_paths=ordered_window_paths,
            deduplicate_dois=deduplicate_dois,
        )
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)
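
A usage sketch for a parallel dump, assuming the BioRxivApi subclass documented below; the output path, date range, and tuning values are examples only.

from paperscraper.xrxiv.xrxiv_api import BioRxivApi

api = BioRxivApi()
# Four workers, each fetching ~90-day windows, merged into one JSONL file.
n_written = api.dump_papers(
    save_path="biorxiv_2023.jsonl",
    start_date="2023-01-01",
    end_date="2023-12-31",
    max_workers=4,
    window_days=90,
    deduplicate_dois=True,
)
print(f"wrote {n_written} records")
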
call_api(start_date: str, end_date: str, cursor: int, max_retries: Optional[int] = None) -> dict

Call the x-rxiv details endpoint with retry and backoff.

Parameters:

Name Type Description Default
start_date str

Query start date in YYYY-MM-DD format.

required
end_date str

Query end date in YYYY-MM-DD format.

required
cursor int

Cursor offset for paginated retrieval.

required
max_retries Optional[int]

Optional retry override for this call.

None

Returns:

Name Type Description
dict dict

Parsed JSON response payload from the endpoint.

Raises:

Type Description
HTTPError

If a non-retryable HTTP error occurs.

Timeout

If retries are exhausted after timeout failures.

ConnectionError

If retries are exhausted after connection failures.

RuntimeError

If the request loop exits unexpectedly without a result.

Source code in paperscraper/xrxiv/xrxiv_api.py
def call_api(
    self,
    start_date: str,
    end_date: str,
    cursor: int,
    max_retries: Optional[int] = None,
) -> dict:
    """Call the x-rxiv details endpoint with retry and backoff.

    Args:
        start_date: Query start date in YYYY-MM-DD format.
        end_date: Query end date in YYYY-MM-DD format.
        cursor: Cursor offset for paginated retrieval.
        max_retries: Optional retry override for this call.

    Returns:
        dict: Parsed JSON response payload from the endpoint.

    Raises:
        requests.HTTPError: If a non-retryable HTTP error occurs.
        requests.Timeout: If retries are exhausted after timeout failures.
        requests.ConnectionError: If retries are exhausted after connection failures.
        RuntimeError: If the request loop exits unexpectedly without a result.
    """
    max_attempts = max_retries if max_retries is not None else self.max_retries
    max_attempts = max(1, int(max_attempts))
    backoff = self.retry_backoff_seconds
    transient_status = {408, 429, 500, 502, 503, 504}
    url = self.get_papers_url.format(
        start_date=start_date,
        end_date=end_date,
        cursor=cursor,
    )
    last_error = None

    for attempt in range(1, max_attempts + 1):
        try:
            response = self.session.get(url, timeout=self.request_timeout)
            if response.status_code in transient_status:
                logger.info(
                    f"{self.server} transient HTTP {response.status_code} at cursor {cursor} "
                    f"for {start_date}..{end_date} (attempt {attempt}/{max_attempts}); retrying"
                )
                if attempt == max_attempts:
                    response.raise_for_status()
                if backoff:
                    sleep(backoff)
                    backoff = min(60.0, max(backoff * 2, backoff + 0.1))
                continue

            response.raise_for_status()
            return response.json()
        except (Timeout, ConnectionError) as exc:
            last_error = exc
            logger.info(
                f"{self.server} request failed ({exc.__class__.__name__}) at cursor {cursor} "
                f"for {start_date}..{end_date} (attempt {attempt}/{max_attempts}); retrying"
            )
            if attempt == max_attempts:
                raise
            if backoff:
                sleep(backoff)
                backoff = min(60.0, max(backoff * 2, backoff + 0.1))
        except (JSONDecodeError, ValueError) as exc:
            last_error = exc
            logger.info(
                f"{self.server} JSON decode failed at cursor {cursor} for {start_date}..{end_date} "
                f"(attempt {attempt}/{max_attempts}); retrying"
            )
            if attempt == max_attempts:
                raise
            if backoff:
                sleep(backoff)
                backoff = min(60.0, max(backoff * 2, backoff + 0.1))
        except HTTPError as exc:
            last_error = exc
            # Non-transient errors should fail fast.
            raise

    if last_error is not None:
        raise last_error
    raise RuntimeError("Failed to query x-rxiv API for unknown reasons")
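
A sketch of a single low-level call, assuming a BioRxivApi instance; the dates are examples. The caller owns pagination: advance the cursor by the number of returned items (pages hold at most 100) and call again until the collection comes back short or empty.

from paperscraper.xrxiv.xrxiv_api import BioRxivApi

api = BioRxivApi()
payload = api.call_api(start_date="2024-01-01", end_date="2024-01-31", cursor=0)
messages = payload.get("messages", [])
message = messages[0] if messages else {}
collection = payload.get("collection", [])
print(message.get("status"), message.get("total"), len(collection))
next_cursor = len(collection)  # pass this as `cursor` to fetch the next page
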
get_papers(start_date: Optional[str] = None, end_date: Optional[str] = None, fields: Optional[List[str]] = None, max_retries: Optional[int] = None) -> Generator

Get paper metadata.

Parameters:

Name Type Description Default
start_date Optional[str]

Begin date. Defaults to launch date.

None
end_date Optional[str]

End date. Defaults to today.

None
fields Optional[List[str]]

Fields to return per paper.

None
max_retries Optional[int]

Optional per-request retry override.

None

Yields:

Type Description
Generator

A generator of paper metadata dicts.

Source code in paperscraper/xrxiv/xrxiv_api.py
def get_papers(
    self,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    fields: Optional[List[str]] = None,
    max_retries: Optional[int] = None,
) -> Generator:
    """
    Get paper metadata.

    Args:
        start_date: Begin date. Defaults to launch date.
        end_date: End date. Defaults to today.
        fields: Fields to return per paper.
        max_retries: Optional per-request retry override.

    Yields:
        A generator of paper metadata dicts.
    """
    if fields is None:
        fields = ["title", "doi", "authors", "abstract", "date", "journal"]

    start_datetime, end_datetime = self._normalize_date_range(start_date, end_date)

    for window_start, window_end in self._iter_date_windows(
        start_datetime, end_datetime
    ):
        start_date_text = window_start.strftime("%Y-%m-%d")
        end_date_text = window_end.strftime("%Y-%m-%d")
        cursor = 0

        while True:
            json_response = self.call_api(
                start_date=start_date_text,
                end_date=end_date_text,
                cursor=cursor,
                max_retries=max_retries,
            )

            messages = json_response.get("messages", [])
            message = messages[0] if messages else {}
            if message.get("status") != "ok":
                break

            collection = json_response.get("collection", [])
            if not collection:
                break

            for paper in collection:
                yield {field: paper.get(field, "") for field in fields}

            returned_count = len(collection)
            cursor += returned_count

            # API pages are capped at 100 items. If we received fewer than that,
            # we have reached the end of this window and no further request is needed.
            if returned_count < 100:
                break

            total = _to_int(message.get("total"), default=0)
            if total and cursor >= total:
                break
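
A sketch of lazy iteration over metadata, again assuming BioRxivApi; the week-long range and the early break only keep the example small.

from paperscraper.xrxiv.xrxiv_api import BioRxivApi

api = BioRxivApi()
papers = api.get_papers(
    start_date="2024-03-01",
    end_date="2024-03-07",
    fields=["title", "doi", "date"],
)
for i, paper in enumerate(papers):
    print(paper["date"], paper["doi"], paper["title"][:80])
    if i >= 4:  # the generator is lazy, so breaking early avoids further requests
        break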

BioRxivApi

Bases: XRXivApi

bioRxiv API.

Source code in paperscraper/xrxiv/xrxiv_api.py
class BioRxivApi(XRXivApi):
    """bioRxiv API."""

    def __init__(
        self,
        max_retries: int = 10,
        request_timeout: Tuple[float, float] = (5.0, 30.0),
        retry_backoff_seconds: float = 1.0,
        window_days: int = 365,
    ):
        super().__init__(
            server="biorxiv",
            launch_date=launch_dates["biorxiv"],
            max_retries=max_retries,
            request_timeout=request_timeout,
            retry_backoff_seconds=retry_backoff_seconds,
            window_days=window_days,
        )

MedRxivApi

Bases: XRXivApi

medRxiv API.

Source code in paperscraper/xrxiv/xrxiv_api.py
class MedRxivApi(XRXivApi):
    """medRxiv API."""

    def __init__(
        self,
        max_retries: int = 10,
        request_timeout: Tuple[float, float] = (5.0, 30.0),
        retry_backoff_seconds: float = 1.0,
        window_days: int = 365,
    ):
        super().__init__(
            server="medrxiv",
            launch_date=launch_dates["medrxiv"],
            max_retries=max_retries,
            request_timeout=request_timeout,
            retry_backoff_seconds=retry_backoff_seconds,
            window_days=window_days,
        )
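
Both subclasses only pin the server name and launch date; every XRXivApi tuning knob stays available. A small sketch (the overrides shown are arbitrary examples):

from paperscraper.xrxiv.xrxiv_api import BioRxivApi, MedRxivApi

biorxiv = BioRxivApi(max_retries=5, window_days=180)
medrxiv = MedRxivApi(request_timeout=(5.0, 60.0))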

xrxiv_query

Query dumps from bioRxiv and medRxiv.

XRXivQuery

Query class.

Source code in paperscraper/xrxiv/xrxiv_query.py
class XRXivQuery:
    """Query class."""

    def __init__(
        self,
        dump_filepath: str,
        fields: List[str] = ["title", "doi", "authors", "abstract", "date", "journal"],
    ):
        """
        Initialize the query class.

        Args:
            dump_filepath (str): filepath to the dump to be queried.
            fields (List[str], optional): fields to be contained in the dump per paper.
                Defaults to ['title', 'doi', 'authors', 'abstract', 'date', 'journal'].
        """
        self.dump_filepath = dump_filepath
        self.fields = fields
        self.errored = False

        try:
            self.df = pd.read_json(self.dump_filepath, lines=True)
            self.df["date"] = [date.strftime("%Y-%m-%d") for date in self.df["date"]]
        except ValueError as e:
            logger.warning(f"Problem in reading file {dump_filepath}: {e} - Skipping!")
            self.errored = True
        except KeyError as e:
            logger.warning(f"Key {e} missing in file from {dump_filepath} - Skipping!")
            self.errored = True

    def search_keywords(
        self,
        keywords: List[Union[str, List[str]]],
        fields: List[str] = None,
        output_filepath: str = None,
    ) -> pd.DataFrame:
        """
        Search for papers in the dump using keywords.

        Args:
            keywords (List[Union[str, List[str]]]): Items will be AND separated. If items
                are lists themselves, they will be OR separated.
            fields (List[str], optional): fields to be used in the query search.
                Defaults to None, i.e., search in all fields excluding date.
            output_filepath (str, optional): optional output filepath where to store
                the hits in JSONL format. Defaults to None, i.e., no export to a file.

        Returns:
            pd.DataFrame: A dataframe with one paper per row.
        """
        if fields is None:
            fields = self.fields
        fields = [field for field in fields if field != "date"]
        hits_per_field = []
        for field in fields:
            field_data = self.df[field].str.lower()
            hits_per_keyword = []
            for keyword in keywords:
                if isinstance(keyword, list):
                    query = "|".join([_.lower() for _ in keyword])
                else:
                    query = keyword.lower()
                hits_per_keyword.append(field_data.str.contains(query))
            if len(hits_per_keyword):
                keyword_hits = hits_per_keyword[0]
                for single_keyword_hits in hits_per_keyword[1:]:
                    keyword_hits &= single_keyword_hits
                hits_per_field.append(keyword_hits)
        if len(hits_per_field):
            hits = hits_per_field[0]
            for single_hits in hits_per_field[1:]:
                hits |= single_hits
        else:
            # No field produced a match mask (e.g. empty `fields`): fall back to an
            # all-False selection so `hits` is always defined below.
            hits = pd.Series(False, index=self.df.index)
        if output_filepath is not None:
            self.df[hits].to_json(output_filepath, orient="records", lines=True)
        return self.df[hits]
__init__(dump_filepath: str, fields: List[str] = ['title', 'doi', 'authors', 'abstract', 'date', 'journal'])

Initialize the query class.

Parameters:

Name Type Description Default
dump_filepath str

filepath to the dump to be queried.

required
fields List[str]

fields to be contained in the dump per paper. Defaults to ['title', 'doi', 'authors', 'abstract', 'date', 'journal'].

['title', 'doi', 'authors', 'abstract', 'date', 'journal']
Source code in paperscraper/xrxiv/xrxiv_query.py
def __init__(
    self,
    dump_filepath: str,
    fields: List[str] = ["title", "doi", "authors", "abstract", "date", "journal"],
):
    """
    Initialize the query class.

    Args:
        dump_filepath (str): filepath to the dump to be queried.
        fields (List[str], optional): fields to be contained in the dump per paper.
            Defaults to ['title', 'doi', 'authors', 'abstract', 'date', 'journal'].
    """
    self.dump_filepath = dump_filepath
    self.fields = fields
    self.errored = False

    try:
        self.df = pd.read_json(self.dump_filepath, lines=True)
        self.df["date"] = [date.strftime("%Y-%m-%d") for date in self.df["date"]]
    except ValueError as e:
        logger.warning(f"Problem in reading file {dump_filepath}: {e} - Skipping!")
        self.errored = True
    except KeyError as e:
        logger.warning(f"Key {e} missing in file from {dump_filepath} - Skipping!")
        self.errored = True
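
A loading sketch, assuming a JSONL dump such as the one produced by dump_papers above; the filename is an example. Checking `errored` avoids querying a dump that failed to parse.

from paperscraper.xrxiv.xrxiv_query import XRXivQuery

querier = XRXivQuery("biorxiv_2023.jsonl")  # example path to a JSONL dump
if not querier.errored:
    print(len(querier.df), "papers loaded")
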
search_keywords(keywords: List[Union[str, List[str]]], fields: List[str] = None, output_filepath: str = None) -> pd.DataFrame

Search for papers in the dump using keywords.

Parameters:

Name Type Description Default
keywords List[Union[str, List[str]]]

Items will be AND separated. If items are lists themselves, they will be OR separated.

required
fields List[str]

fields to be used in the query search. Defaults to None, a.k.a. search in all fields excluding date.

None
output_filepath str

optional output filepath where to store the hits in JSONL format. Defaults to None, a.k.a., no export to a file.

None

Returns:

Type Description
DataFrame

pd.DataFrame: A dataframe with one paper per row.

Source code in paperscraper/xrxiv/xrxiv_query.py
def search_keywords(
    self,
    keywords: List[Union[str, List[str]]],
    fields: List[str] = None,
    output_filepath: str = None,
) -> pd.DataFrame:
    """
    Search for papers in the dump using keywords.

    Args:
        keywords (List[Union[str, List[str]]]): Items will be AND separated. If items
            are lists themselves, they will be OR separated.
        fields (List[str], optional): fields to be used in the query search.
            Defaults to None, i.e., search in all fields excluding date.
        output_filepath (str, optional): optional output filepath where to store
            the hits in JSONL format. Defaults to None, i.e., no export to a file.

    Returns:
        pd.DataFrame: A dataframe with one paper per row.
    """
    if fields is None:
        fields = self.fields
    fields = [field for field in fields if field != "date"]
    hits_per_field = []
    for field in fields:
        field_data = self.df[field].str.lower()
        hits_per_keyword = []
        for keyword in keywords:
            if isinstance(keyword, list):
                query = "|".join([_.lower() for _ in keyword])
            else:
                query = keyword.lower()
            hits_per_keyword.append(field_data.str.contains(query))
        if len(hits_per_keyword):
            keyword_hits = hits_per_keyword[0]
            for single_keyword_hits in hits_per_keyword[1:]:
                keyword_hits &= single_keyword_hits
            hits_per_field.append(keyword_hits)
    if len(hits_per_field):
        hits = hits_per_field[0]
        for single_hits in hits_per_field[1:]:
            hits |= single_hits
    else:
        # No field produced a match mask (e.g. empty `fields`): fall back to an
        # all-False selection so `hits` is always defined below.
        hits = pd.Series(False, index=self.df.index)
    if output_filepath is not None:
        self.df[hits].to_json(output_filepath, orient="records", lines=True)
    return self.df[hits]
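
A query sketch illustrating the AND/OR semantics, assuming the `querier` from the loading example above; keyword lists are OR-ed, top-level items are AND-ed, and the output path is only an example.

hits = querier.search_keywords(
    [["covid", "sars-cov-2"], "vaccine"],  # (covid OR sars-cov-2) AND vaccine
    fields=["title", "abstract"],
    output_filepath="covid_vaccine_hits.jsonl",  # optional JSONL export
)
print(len(hits), "matching papers")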