paperscraper.get_dumps

arxiv

Dump arxiv data in JSONL format.

arxiv(start_date: Optional[str] = None, end_date: Optional[str] = None, save_path: str = save_path)

Fetches papers from arXiv based on time range, i.e., start_date and end_date. If the start_date and end_date are not provided, fetches papers from the earliest possible date to the current date. The fetched papers are stored in JSONL format.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| start_date | str | Start date in format YYYY-MM-DD. Defaults to None. | None |
| end_date | str | End date in format YYYY-MM-DD. Defaults to None. | None |
| save_path | str | Path to save the JSONL dump. Defaults to save_path. | save_path |
Source code in paperscraper/get_dumps/arxiv.py
def arxiv(
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    save_path: str = save_path,
):
    """
    Fetches papers from arXiv based on time range, i.e., start_date and end_date.
    If the start_date and end_date are not provided, fetches papers from the earliest
    possible date to the current date. The fetched papers are stored in JSONL format.

    Args:
        start_date (str, optional): Start date in format YYYY-MM-DD. Defaults to None.
        end_date (str, optional): End date in format YYYY-MM-DD. Defaults to None.
        save_path (str, optional): Path to save the JSONL dump. Defaults to save_path.
    """
    # Set default dates
    EARLIEST_START = "1991-01-01"
    if start_date is None:
        start_date = EARLIEST_START
    if end_date is None:
        end_date = datetime.today().strftime("%Y-%m-%d")

    # Convert dates to datetime objects
    start_date = datetime.strptime(start_date, "%Y-%m-%d")
    end_date = datetime.strptime(end_date, "%Y-%m-%d")

    if start_date > end_date:
        raise ValueError(
            f"start_date {start_date} cannot be later than end_date {end_date}"
        )

    # Open file for writing results
    with open(save_path, "w") as fp:
        progress_bar = tqdm(total=(end_date - start_date).days + 1)

        current_date = start_date
        while current_date <= end_date:
            next_date = current_date + timedelta(days=1)
            progress_bar.set_description(
                f"Fetching {current_date.strftime('%Y-%m-%d')}"
            )

            # Format dates for query
            query = f"submittedDate:[{current_date.strftime('%Y%m%d0000')} TO {next_date.strftime('%Y%m%d0000')}]"
            try:
                papers = get_arxiv_papers_api(
                    query=query,
                    fields=["title", "authors", "date", "abstract", "journal", "doi"],
                    verbose=False,
                )
                if not papers.empty:
                    for paper in papers.to_dict(orient="records"):
                        fp.write(json.dumps(paper) + "\n")
            except Exception as e:
                print(f"Arxiv scraping error: {current_date.strftime('%Y-%m-%d')}: {e}")
            current_date = next_date
            progress_bar.update(1)
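
A minimal usage sketch (the import path follows this page's module name; the date range and file name are illustrative):

from paperscraper.get_dumps import arxiv

# Dump all arXiv metadata submitted in January 2024 into a local JSONL file.
arxiv(start_date="2024-01-01", end_date="2024-01-31", save_path="arxiv_2024_01.jsonl")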

biorxiv

Dump bioRxiv data in JSONL format.

biorxiv(start_date: Optional[str] = None, end_date: Optional[str] = None, save_path: str = save_path, max_retries: int = 10)

Fetches papers from biorxiv based on time range, i.e., start_date and end_date. If the start_date and end_date are not provided, papers will be fetched from biorxiv from the launch date of biorxiv until the current date. The fetched papers will be stored in jsonl format in save_path.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| start_date | str | begin date expressed as YYYY-MM-DD. Defaults to None, i.e., earliest possible. | None |
| end_date | str | end date expressed as YYYY-MM-DD. Defaults to None, i.e., today. | None |
| save_path | str | Path where the dump is stored. Defaults to save_path. | save_path |
| max_retries | int | Number of retries when API shows connection issues. Defaults to 10. | 10 |
Source code in paperscraper/get_dumps/biorxiv.py
def biorxiv(
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    save_path: str = save_path,
    max_retries: int = 10,
):
    """Fetches papers from biorxiv based on time range, i.e., start_date and end_date.
    If the start_date and end_date are not provided, papers will be fetched from biorxiv
    from the launch date of biorxiv until the current date. The fetched papers will be
    stored in jsonl format in save_path.

    Args:
        start_date (str, optional): begin date expressed as YYYY-MM-DD.
            Defaults to None, i.e., earliest possible.
        end_date (str, optional): end date expressed as YYYY-MM-DD.
            Defaults to None, i.e., today.
        save_path (str, optional): Path where the dump is stored.
            Defaults to save_path.
        max_retries (int, optional): Number of retries when API shows connection issues.
            Defaults to 10.
    """
    # create API client
    api = BioRxivApi(max_retries=max_retries)

    # dump all papers
    with open(save_path, "w") as fp:
        for index, paper in enumerate(
            tqdm(api.get_papers(start_date=start_date, end_date=end_date))
        ):
            if index > 0:
                fp.write(os.linesep)
            fp.write(json.dumps(paper))
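
As an illustration (a sketch, not part of the library's documented examples), a short window can be dumped and read back as line-delimited JSON; the pandas dependency and file name are assumptions:

import pandas as pd

from paperscraper.get_dumps import biorxiv

# Dump one week of bioRxiv metadata, retrying flaky API calls up to 10 times (the default).
biorxiv(start_date="2023-06-01", end_date="2023-06-07", save_path="biorxiv_june.jsonl")

# Each record is one JSON object per line, so it loads as line-delimited JSON.
df = pd.read_json("biorxiv_june.jsonl", lines=True)
print(f"{len(df)} records with columns: {list(df.columns)}")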

chemrxiv

Dump chemRxiv data in JSONL format.

chemrxiv(start_date: Optional[str] = None, end_date: Optional[str] = None, save_path: str = save_path) -> None

Fetches papers from chemrxiv based on time range, i.e., start_date and end_date. If the start_date and end_date are not provided, papers will be fetched from chemrxiv from the launch date of chemrxiv until the current date. The fetched papers will be stored in jsonl format in save_path.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| start_date | str | begin date expressed as YYYY-MM-DD. Defaults to None, i.e., earliest possible. | None |
| end_date | str | end date expressed as YYYY-MM-DD. Defaults to None, i.e., today. | None |
| save_path | str | Path where the dump is stored. Defaults to save_path. | save_path |
Source code in paperscraper/get_dumps/chemrxiv.py
def chemrxiv(
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    save_path: str = save_path,
) -> None:
    """Fetches papers from bichemrxiv based on time range, i.e., start_date and end_date.
    If the start_date and end_date are not provided, papers will be fetched from chemrxiv
    from the launch date of chemrxiv until the current date. The fetched papers will be
    stored in jsonl format in save_path.

    Args:
        start_date (str, optional): begin date expressed as YYYY-MM-DD.
            Defaults to None, i.e., earliest possible.
        end_date (str, optional): end date expressed as YYYY-MM-DD.
            Defaults to None, i.e., today.
        save_path (str, optional): Path where the dump is stored.
            Defaults to save_path.
    """

    # create API client
    api = ChemrxivAPI(start_date, end_date)
    # Download the data
    download_full(save_folder, api)
    # Convert to JSONL format.
    parse_dump(save_folder, save_path)
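
A minimal usage sketch (import path from this page; the output path is illustrative). The function first stages per-paper JSON files via download_full and then converts them to a single JSONL dump:

from paperscraper.get_dumps import chemrxiv

# Download ChemRxiv records for 2022 and convert them into one JSONL dump.
chemrxiv(start_date="2022-01-01", end_date="2022-12-31", save_path="chemrxiv_2022.jsonl")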

medrxiv

Dump medrxiv data in JSONL format.

medrxiv(start_date: Optional[str] = None, end_date: Optional[str] = None, save_path: str = save_path, max_retries: int = 10)

Fetches papers from medrxiv based on time range, i.e., start_date and end_date. If the start_date and end_date are not provided, then papers will be fetched from medrxiv starting from the launch date of medrxiv until the current date. The fetched papers will be stored in jsonl format in save_path.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| start_date | str | begin date expressed as YYYY-MM-DD. Defaults to None, i.e., earliest possible. | None |
| end_date | str | end date expressed as YYYY-MM-DD. Defaults to None, i.e., today. | None |
| save_path | str | Path where the dump is stored. Defaults to save_path. | save_path |
| max_retries | int | Number of retries when API shows connection issues. Defaults to 10. | 10 |
Source code in paperscraper/get_dumps/medrxiv.py
def medrxiv(
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    save_path: str = save_path,
    max_retries: int = 10,
):
    """Fetches papers from medrxiv based on time range, i.e., start_date and end_date.
    If the start_date and end_date are not provided, then papers will be fetched from
    medrxiv starting from the launch date of medrxiv until current date. The fetched
    papers will be stored in jsonl format in save_path.

    Args:
        start_date (str, optional): begin date expressed as YYYY-MM-DD.
            Defaults to None, i.e., earliest possible.
        end_date (str, optional): end date expressed as YYYY-MM-DD.
            Defaults to None, i.e., today.
        save_path (str, optional): Path where the dump is stored.
            Defaults to save_path.
        max_retries (int, optional): Number of retries when API shows connection issues.
            Defaults to 10.
    """
    # create API client
    api = MedRxivApi(max_retries=max_retries)
    # dump all papers
    with open(save_path, "w") as fp:
        for index, paper in enumerate(
            tqdm(api.get_papers(start_date=start_date, end_date=end_date))
        ):
            if index > 0:
                fp.write(os.linesep)
            fp.write(json.dumps(paper))
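
Usage mirrors the bioRxiv dump; a sketch with an open-ended end date and a larger retry budget (values are illustrative):

from paperscraper.get_dumps import medrxiv

# Dump medRxiv metadata from 2023 onwards, allowing extra retries on connection issues.
medrxiv(start_date="2023-01-01", save_path="medrxiv_2023_onwards.jsonl", max_retries=20)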

utils

chemrxiv

get_author(author_list: List[Dict]) -> str

Parse ChemRxiv dump entry to extract author list

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| author_list | list | List of dicts, one per author. | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| str | str | ;-concatenated author list. |

Source code in paperscraper/get_dumps/utils/chemrxiv/utils.py
def get_author(author_list: List[Dict]) -> str:
    """Parse ChemRxiv dump entry to extract author list

    Args:
        author_list (list): List of dicts, one per author.

    Returns:
        str: ;-concatenated author list.
    """

    return "; ".join([" ".join([a["firstName"], a["lastName"]]) for a in author_list])
get_categories(category_list: List[Dict]) -> str

Parse ChemRxiv dump entry to extract the categories of the paper

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| category_list | list | List of dicts, one per category. | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| str | str | ;-concatenated category list. |

Source code in paperscraper/get_dumps/utils/chemrxiv/utils.py
def get_categories(category_list: List[Dict]) -> str:
    """Parse ChemRxiv dump entry to extract the categories of the paper

    Args:
        category_list (list): List of dicts, one per category.

    Returns:
        str: ;-concatenated category list.
    """

    return "; ".join([a["name"] for a in category_list])
get_date(datestring: str) -> str

Get the date of a chemrxiv dump entry.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| datestring | str | String in the format: 2021-10-15T05:12:32.356Z | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| str | str | Date in the format: YYYY-MM-DD. |

Source code in paperscraper/get_dumps/utils/chemrxiv/utils.py
def get_date(datestring: str) -> str:
    """Get the date of a chemrxiv dump enry.

    Args:
        datestring: String in the format: 2021-10-15T05:12:32.356Z

    Returns:
        str: Date in the format: YYYY-MM-DD.
    """
    return datestring.split("T")[0]
get_metrics(metrics_list: List[Dict]) -> Dict

Parse ChemRxiv dump entry to extract the access metrics of the paper.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| metrics_list | List[Dict] | A list of single-keyed dictionaries, each containing key and value for exactly one metric. | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| Dict | Dict | A flattened dictionary with all metrics and a timestamp. |

Source code in paperscraper/get_dumps/utils/chemrxiv/utils.py
def get_metrics(metrics_list: List[Dict]) -> Dict:
    """
    Parse ChemRxiv dump entry to extract the access metrics of the paper.

    Args:
        metrics_list (List[Dict]): A list of single-keyed dictionaries, each
            containing key and value for exactly one metric.

    Returns:
        Dict: A flattened dictionary with all metrics and a timestamp
    """
    metric_dict = {m["description"]: m["value"] for m in metrics_list}

    # This assumes that the .jsonl is constructed at roughly the same date
    # where this entry was obtained from the API
    metric_dict.update({"timestamp": today})
    return metric_dict
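
Taken together, these helpers flatten the nested fields of a raw ChemRxiv record into the strings stored in the dump. A small worked sketch with a hypothetical record (field values invented for illustration; import path taken from the source paths above):

from paperscraper.get_dumps.utils.chemrxiv.utils import (
    get_author,
    get_categories,
    get_date,
    get_metrics,
)

record = {
    "authors": [
        {"firstName": "Ada", "lastName": "Lovelace"},
        {"firstName": "Alan", "lastName": "Turing"},
    ],
    "categories": [{"name": "Theoretical Chemistry"}, {"name": "Machine Learning"}],
    "statusDate": "2021-10-15T05:12:32.356Z",
    "metrics": [{"description": "Abstract Views", "value": 123}],
}

print(get_author(record["authors"]))         # Ada Lovelace; Alan Turing
print(get_categories(record["categories"]))  # Theoretical Chemistry; Machine Learning
print(get_date(record["statusDate"]))        # 2021-10-15
print(get_metrics(record["metrics"]))        # {'Abstract Views': 123, 'timestamp': ...}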
parse_dump(source_path: str, target_path: str) -> None

Parses the dump as generated by the chemrXiv API and this repo: https://github.com/cthoyt/chemrxiv-summarize into a format that is equal to that of biorXiv and medRxiv.

NOTE: This is a lazy parser trying to store all data in memory.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| source_path | str | Path to the source dump | required |
| target_path | str | Path to the target (JSONL) dump | required |
Source code in paperscraper/get_dumps/utils/chemrxiv/utils.py
def parse_dump(source_path: str, target_path: str) -> None:
    """
    Parses the dump as generated by the chemrXiv API and this repo:
    https://github.com/cthoyt/chemrxiv-summarize
    into a format that is equal to that of biorXiv and medRxiv.

    NOTE: This is a lazy parser trying to store all data in memory.

    Args:
        source_path: Path to the source dump
        target_path: Path to the target (JSONL) dump
    """

    dump = []
    # Read source dump
    for file_name in tqdm(os.listdir(source_path)):
        if not file_name.endswith(".json"):
            continue
        filepath = os.path.join(source_path, file_name)
        with open(filepath, "r") as f:
            source_paper = json.load(f)

        target_paper = {
            "title": source_paper["title"],
            "doi": source_paper["doi"],
            "published_doi": (
                source_paper["vor"]["vorDoi"] if source_paper["vor"] else "N.A."
            ),
            "published_url": (
                source_paper["vor"]["url"] if source_paper["vor"] else "N.A."
            ),
            "authors": get_author(source_paper["authors"]),
            "abstract": source_paper["abstract"],
            "date": get_date(source_paper["statusDate"]),
            "journal": "chemRxiv",
            "categories": get_categories(source_paper["categories"]),
            "metrics": get_metrics(source_paper["metrics"]),
            "license": source_paper["license"]["name"],
        }
        dump.append(target_paper)
        os.remove(filepath)
    # Write dump
    with open(target_path, "w") as f:
        for idx, target_paper in enumerate(dump):
            if idx > 0:
                f.write(os.linesep)
            f.write(json.dumps(target_paper))
    logger.info("Done, shutting down")
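
A usage sketch (paths are illustrative): convert a folder of per-paper ChemRxiv JSON files, as staged by the download step, into one JSONL dump. Note that the source .json files are deleted as they are consumed.

from paperscraper.get_dumps.utils.chemrxiv.utils import parse_dump

parse_dump("chemrxiv_raw_dump/", "chemrxiv.jsonl")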
chemrxiv_api
ChemrxivAPI

Handle Open Engage API requests. Adapted from https://github.com/fxcoudert/tools/blob/master/chemRxiv/chemRxiv.py.

Source code in paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py
class ChemrxivAPI:
    """Handle OpenEngage API requests, using access.
    Adapted from https://github.com/fxcoudert/tools/blob/master/chemRxiv/chemRxiv.py.
    """

    base = "https://chemrxiv.org/engage/chemrxiv/public-api/v1/"

    def __init__(
        self,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        page_size: Optional[int] = None,
        max_retries: int = 10,
    ):
        """
        Initialize API class.

        Args:
            start_date (Optional[str], optional): begin date expressed as YYYY-MM-DD.
                Defaults to None.
            end_date (Optional[str], optional): end date expressed as YYYY-MM-DD.
                Defaults to None.
            page_size (int, optional): The batch size used to fetch the records from chemrxiv.
            max_retries (int): Number of retries in case of error
        """

        self.page_size = page_size or 50
        self.max_retries = max_retries

        # Begin Date and End Date of the search
        launch_date = launch_dates["chemrxiv"]
        launch_datetime = datetime.fromisoformat(launch_date)

        if start_date:
            start_datetime = datetime.fromisoformat(start_date)
            if start_datetime < launch_datetime:
                self.start_date = launch_date
                logger.warning(
                    f"Begin date {start_date} is before chemrxiv launch date. Will use {launch_date} instead."
                )
            else:
                self.start_date = start_date
        else:
            self.start_date = launch_date
        if end_date:
            end_datetime = datetime.fromisoformat(end_date)
            if end_datetime > now_datetime:
                logger.warning(
                    f"End date {end_date} is in the future. Will use {now_datetime} instead."
                )
                self.end_date = now_datetime.strftime("%Y-%m-%d")
            else:
                self.end_date = end_date
        else:
            self.end_date = now_datetime.strftime("%Y-%m-%d")

    def request(self, url, method, params=None):
        """Send an API request to open Engage."""

        for attempt in range(self.max_retries):
            try:
                if method.casefold() == "get":
                    return requests.get(url, params=params, timeout=10)
                elif method.casefold() == "post":
                    return requests.post(url, json=params, timeout=10)
                else:
                    raise ConnectionError(f"Unknown method for query: {method}")
            except ChunkedEncodingError as e:
                logger.warning(f"ChunkedEncodingError occurred for {url}: {e}")
                if attempt + 1 == self.max_retries:
                    raise e
                time.sleep(3)

    def query(self, query, method="get", params=None):
        """Perform a direct query."""

        r = self.request(urljoin(self.base, query), method, params=params)
        r.raise_for_status()
        return r.json()

    def query_generator(self, query, method: str = "get", params: Dict = {}):
        """Query for a list of items, with paging. Returns a generator."""

        try:
            total = self.number_of_preprints()
        except Exception:
            total = float("inf")   # fallback if that call fails

        page = 0
        while True:
            params.update(
                {
                    "limit": self.page_size,
                    "skip": page * self.page_size,
                    "searchDateFrom": self.start_date,
                    "searchDateTo": self.end_date,
                }
            )
            if page * self.page_size > total:
                break
            r = self.request(urljoin(self.base, query), method, params=params)
            if r.status_code == 400:
                raise ValueError(r.json()["message"])
            r.raise_for_status()
            r = r.json()
            r = r["itemHits"]

            # If we have no more results, bail out
            if len(r) == 0:
                return

            yield from r
            page += 1

    def all_preprints(self):
        """Return a generator to all the chemRxiv articles."""
        return self.query_generator("items")

    def preprint(self, article_id):
        """Information on a given preprint.
        .. seealso:: https://docs.figshare.com/#public_article
        """
        return self.query(os.path.join("items", article_id))

    def number_of_preprints(self):
        return self.query("items")["totalCount"]
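
As a usage sketch (import path inferred from the source path above; the dates are illustrative), the paged generator can be consumed directly:

from paperscraper.get_dumps.utils.chemrxiv.chemrxiv_api import ChemrxivAPI

# Restrict the search window to one week; out-of-range dates are clamped to the
# ChemRxiv launch date and to today, respectively.
api = ChemrxivAPI(start_date="2023-01-01", end_date="2023-01-07")

print("Preprints on the server:", api.number_of_preprints())

# all_preprints() yields one raw record per hit of the paged "items" endpoint.
for hit in api.all_preprints():
    print(hit)
    break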
__init__(start_date: Optional[str] = None, end_date: Optional[str] = None, page_size: Optional[int] = None, max_retries: int = 10)

Initialize API class.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| start_date | Optional[str] | begin date expressed as YYYY-MM-DD. Defaults to None. | None |
| end_date | Optional[str] | end date expressed as YYYY-MM-DD. Defaults to None. | None |
| page_size | int | The batch size used to fetch the records from chemrxiv. | None |
| max_retries | int | Number of retries in case of error | 10 |
Source code in paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py
def __init__(
    self,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    page_size: Optional[int] = None,
    max_retries: int = 10,
):
    """
    Initialize API class.

    Args:
        start_date (Optional[str], optional): begin date expressed as YYYY-MM-DD.
            Defaults to None.
        end_date (Optional[str], optional): end date expressed as YYYY-MM-DD.
            Defaults to None.
        page_size (int, optional): The batch size used to fetch the records from chemrxiv.
        max_retries (int): Number of retries in case of error
    """

    self.page_size = page_size or 50
    self.max_retries = max_retries

    # Begin Date and End Date of the search
    launch_date = launch_dates["chemrxiv"]
    launch_datetime = datetime.fromisoformat(launch_date)

    if start_date:
        start_datetime = datetime.fromisoformat(start_date)
        if start_datetime < launch_datetime:
            self.start_date = launch_date
            logger.warning(
                f"Begin date {start_date} is before chemrxiv launch date. Will use {launch_date} instead."
            )
        else:
            self.start_date = start_date
    else:
        self.start_date = launch_date
    if end_date:
        end_datetime = datetime.fromisoformat(end_date)
        if end_datetime > now_datetime:
            logger.warning(
                f"End date {end_date} is in the future. Will use {now_datetime} instead."
            )
            self.end_date = now_datetime.strftime("%Y-%m-%d")
        else:
            self.end_date = end_date
    else:
        self.end_date = now_datetime.strftime("%Y-%m-%d")
request(url, method, params=None)

Send an API request to open Engage.

Source code in paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py
def request(self, url, method, params=None):
    """Send an API request to open Engage."""

    for attempt in range(self.max_retries):
        try:
            if method.casefold() == "get":
                return requests.get(url, params=params, timeout=10)
            elif method.casefold() == "post":
                return requests.post(url, json=params, timeout=10)
            else:
                raise ConnectionError(f"Unknown method for query: {method}")
        except ChunkedEncodingError as e:
            logger.warning(f"ChunkedEncodingError occurred for {url}: {e}")
            if attempt + 1 == self.max_retries:
                raise e
            time.sleep(3)
query(query, method='get', params=None)

Perform a direct query.

Source code in paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py
def query(self, query, method="get", params=None):
    """Perform a direct query."""

    r = self.request(urljoin(self.base, query), method, params=params)
    r.raise_for_status()
    return r.json()
query_generator(query, method: str = 'get', params: Dict = {})

Query for a list of items, with paging. Returns a generator.

Source code in paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py
def query_generator(self, query, method: str = "get", params: Dict = {}):
    """Query for a list of items, with paging. Returns a generator."""

    try:
        total = self.number_of_preprints()
    except Exception:
        total = float("inf")   # fallback if that call fails

    page = 0
    while True:
        params.update(
            {
                "limit": self.page_size,
                "skip": page * self.page_size,
                "searchDateFrom": self.start_date,
                "searchDateTo": self.end_date,
            }
        )
        if page * self.page_size > total:
            break
        r = self.request(urljoin(self.base, query), method, params=params)
        if r.status_code == 400:
            raise ValueError(r.json()["message"])
        r.raise_for_status()
        r = r.json()
        r = r["itemHits"]

        # If we have no more results, bail out
        if len(r) == 0:
            return

        yield from r
        page += 1
all_preprints()

Return a generator to all the chemRxiv articles.

Source code in paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py
def all_preprints(self):
    """Return a generator to all the chemRxiv articles."""
    return self.query_generator("items")
preprint(article_id)

Information on a given preprint. See also: https://docs.figshare.com/#public_article

Source code in paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py
def preprint(self, article_id):
    """Information on a given preprint.
    .. seealso:: https://docs.figshare.com/#public_article
    """
    return self.query(os.path.join("items", article_id))
utils

Misc utils to download chemRxiv dump

get_author(author_list: List[Dict]) -> str

Parse ChemRxiv dump entry to extract author list

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| author_list | list | List of dicts, one per author. | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| str | str | ;-concatenated author list. |

Source code in paperscraper/get_dumps/utils/chemrxiv/utils.py
def get_author(author_list: List[Dict]) -> str:
    """Parse ChemRxiv dump entry to extract author list

    Args:
        author_list (list): List of dicts, one per author.

    Returns:
        str: ;-concatenated author list.
    """

    return "; ".join([" ".join([a["firstName"], a["lastName"]]) for a in author_list])
get_categories(category_list: List[Dict]) -> str

Parse ChemRxiv dump entry to extract the categories of the paper

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| category_list | list | List of dicts, one per category. | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| str | str | ;-concatenated category list. |

Source code in paperscraper/get_dumps/utils/chemrxiv/utils.py
def get_categories(category_list: List[Dict]) -> str:
    """Parse ChemRxiv dump entry to extract the categories of the paper

    Args:
        category_list (list): List of dicts, one per category.

    Returns:
        str: ;-concatenated category list.
    """

    return "; ".join([a["name"] for a in category_list])
get_date(datestring: str) -> str

Get the date of a chemrxiv dump entry.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| datestring | str | String in the format: 2021-10-15T05:12:32.356Z | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| str | str | Date in the format: YYYY-MM-DD. |

Source code in paperscraper/get_dumps/utils/chemrxiv/utils.py
def get_date(datestring: str) -> str:
    """Get the date of a chemrxiv dump enry.

    Args:
        datestring: String in the format: 2021-10-15T05:12:32.356Z

    Returns:
        str: Date in the format: YYYY-MM-DD.
    """
    return datestring.split("T")[0]
get_metrics(metrics_list: List[Dict]) -> Dict

Parse ChemRxiv dump entry to extract the access metrics of the paper.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| metrics_list | List[Dict] | A list of single-keyed dictionaries, each containing key and value for exactly one metric. | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| Dict | Dict | A flattened dictionary with all metrics and a timestamp. |

Source code in paperscraper/get_dumps/utils/chemrxiv/utils.py
def get_metrics(metrics_list: List[Dict]) -> Dict:
    """
    Parse ChemRxiv dump entry to extract the access metrics of the paper.

    Args:
        metrics_list (List[Dict]): A list of single-keyed dictionaries, each
            containing key and value for exactly one metric.

    Returns:
        Dict: A flattened dictionary with all metrics and a timestamp
    """
    metric_dict = {m["description"]: m["value"] for m in metrics_list}

    # This assumes that the .jsonl is constructed at roughly the same date
    # where this entry was obtained from the API
    metric_dict.update({"timestamp": today})
    return metric_dict
parse_dump(source_path: str, target_path: str) -> None

Parses the dump as generated by the chemrXiv API and this repo: https://github.com/cthoyt/chemrxiv-summarize into a format that is equal to that of biorXiv and medRxiv.

NOTE: This is a lazy parser trying to store all data in memory.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| source_path | str | Path to the source dump | required |
| target_path | str | Path to the target (JSONL) dump | required |
Source code in paperscraper/get_dumps/utils/chemrxiv/utils.py
def parse_dump(source_path: str, target_path: str) -> None:
    """
    Parses the dump as generated by the chemrXiv API and this repo:
    https://github.com/cthoyt/chemrxiv-summarize
    into a format that is equal to that of biorXiv and medRxiv.

    NOTE: This is a lazy parser trying to store all data in memory.

    Args:
        source_path: Path to the source dump
        target_path: Path to the target (JSONL) dump
    """

    dump = []
    # Read source dump
    for file_name in tqdm(os.listdir(source_path)):
        if not file_name.endswith(".json"):
            continue
        filepath = os.path.join(source_path, file_name)
        with open(filepath, "r") as f:
            source_paper = json.load(f)

        target_paper = {
            "title": source_paper["title"],
            "doi": source_paper["doi"],
            "published_doi": (
                source_paper["vor"]["vorDoi"] if source_paper["vor"] else "N.A."
            ),
            "published_url": (
                source_paper["vor"]["url"] if source_paper["vor"] else "N.A."
            ),
            "authors": get_author(source_paper["authors"]),
            "abstract": source_paper["abstract"],
            "date": get_date(source_paper["statusDate"]),
            "journal": "chemRxiv",
            "categories": get_categories(source_paper["categories"]),
            "metrics": get_metrics(source_paper["metrics"]),
            "license": source_paper["license"]["name"],
        }
        dump.append(target_paper)
        os.remove(filepath)
    # Write dump
    with open(target_path, "w") as f:
        for idx, target_paper in enumerate(dump):
            if idx > 0:
                f.write(os.linesep)
            f.write(json.dumps(target_paper))
    logger.info("Done, shutting down")