paperscraper.arxiv

`paperscraper.arxiv` ¶

`XRXivQuery` ¶

Query class.

Source code in paperscraper/xrxiv/xrxiv_query.py

class XRXivQuery:
    """Keyword search over a local JSONL dump of paper metadata.

    The dump is read into a pandas DataFrame on construction. If reading
    fails, a warning is logged and ``errored`` is set to True; in that case
    ``df`` may not have been assigned, so check ``errored`` before querying.
    """

    def __init__(
        self,
        dump_filepath: str,
        fields: List[str] = ["title", "doi", "authors", "abstract", "date", "journal"],
    ):
        """
        Initialize the query class.

        Args:
            dump_filepath (str): filepath to the dump to be queried.
            fields (List[str], optional): fields contained in the dump per paper.
                Defaults to ['title', 'doi', 'authors', 'abstract', 'date', 'journal'].
        """
        self.dump_filepath = dump_filepath
        # Store a copy: the default list is shared by every call, so aliasing it
        # would let a mutation of one instance's `fields` leak into all others.
        self.fields = list(fields)
        self.errored = False

        try:
            self.df = pd.read_json(self.dump_filepath, lines=True)
            # Normalise the dates to plain 'YYYY-MM-DD' strings.
            self.df["date"] = [date.strftime("%Y-%m-%d") for date in self.df["date"]]
        except ValueError as e:
            logger.warning(f"Problem in reading file {dump_filepath}: {e} - Skipping!")
            self.errored = True
        except KeyError as e:
            logger.warning(f"Key {e} missing in file from {dump_filepath} - Skipping!")
            self.errored = True

    def search_keywords(
        self,
        keywords: List[Union[str, List[str]]],
        fields: List[str] = None,
        output_filepath: str = None,
    ) -> pd.DataFrame:
        """
        Search for papers in the dump using keywords.

        A paper is a hit if at least one searched field satisfies every
        keyword item. Matching is case-insensitive and is done with
        ``Series.str.contains``; synonyms are joined with ``|``, so keywords
        are treated as regular expressions.

        Args:
            keywords (List[Union[str, List[str]]]): Items will be AND separated.
                If items are lists themselves, they will be OR separated.
            fields (List[str], optional): fields to be used in the query search.
                Defaults to None, a.k.a. search in all fields excluding date.
            output_filepath (str, optional): optional output filepath where to store
                the hits in JSONL format. Defaults to None, a.k.a., no export to a file.

        Returns:
            pd.DataFrame: A dataframe with one paper per row. Empty if there
                are no keywords or no searchable fields.
        """
        if fields is None:
            fields = self.fields
        fields = [field for field in fields if field != "date"]

        # Start from "nothing matches" so an empty keyword list or an empty
        # field list gives an empty result instead of an unbound variable.
        hits = pd.Series(False, index=self.df.index, dtype=bool)
        if keywords:
            for field in fields:
                field_data = self.df[field].str.lower()
                field_hits = pd.Series(True, index=self.df.index, dtype=bool)
                for keyword in keywords:
                    if isinstance(keyword, list):
                        query = "|".join(synonym.lower() for synonym in keyword)
                    else:
                        query = keyword.lower()
                    # na=False: a missing value in the field counts as a
                    # non-match, keeping the mask free of NaN so it can
                    # always be used to index the dataframe.
                    field_hits = field_hits & field_data.str.contains(query, na=False)
                hits = hits | field_hits

        papers = self.df[hits]
        if output_filepath is not None:
            papers.to_json(output_filepath, orient="records", lines=True)
        return papers

`__init__(dump_filepath: str, fields: List[str] = ['title', 'doi', 'authors', 'abstract', 'date', 'journal'])` ¶

Initialize the query class.

Parameters:

Name	Type	Description	Default
`dump_filepath`	`str`	filepath to the dump to be queried.	required
`fields`	`List[str]`	fields contained in the dump per paper. Defaults to ['title', 'doi', 'authors', 'abstract', 'date', 'journal'].	`['title', 'doi', 'authors', 'abstract', 'date', 'journal']`

Source code in paperscraper/xrxiv/xrxiv_query.py

def __init__(
    self,
    dump_filepath: str,
    fields: List[str] = ["title", "doi", "authors", "abstract", "date", "journal"],
):
    """
    Initialize the query class.

    The dump is read into ``self.df``. If reading fails, a warning is logged
    and ``self.errored`` is set to True instead of raising.

    Args:
        dump_filepath (str): filepath to the dump to be queried.
        fields (List[str], optional): fields contained in the dump per paper.
            Defaults to ['title', 'doi', 'authors', 'abstract', 'date', 'journal'].
    """
    self.dump_filepath = dump_filepath
    # Store a copy: the default list is shared by every call, so aliasing it
    # would let a mutation of one instance's `fields` leak into all others.
    self.fields = list(fields)
    self.errored = False

    try:
        self.df = pd.read_json(self.dump_filepath, lines=True)
        # Normalise the dates to plain 'YYYY-MM-DD' strings.
        self.df["date"] = [date.strftime("%Y-%m-%d") for date in self.df["date"]]
    except ValueError as e:
        logger.warning(f"Problem in reading file {dump_filepath}: {e} - Skipping!")
        self.errored = True
    except KeyError as e:
        logger.warning(f"Key {e} missing in file from {dump_filepath} - Skipping!")
        self.errored = True

`search_keywords(keywords: List[Union[str, List[str]]], fields: List[str] = None, output_filepath: str = None) -> pd.DataFrame` ¶

Search for papers in the dump using keywords.

Parameters:

Name	Type	Description	Default
`keywords`	`List[Union[str, List[str]]]`	Items will be AND separated. If items are lists themselves, they will be OR separated.	required
`fields`	`List[str]`	fields to be used in the query search. Defaults to None, a.k.a. search in all fields excluding date.	`None`
`output_filepath`	`str`	optional output filepath where to store the hits in JSONL format. Defaults to None, a.k.a., no export to a file.	`None`

Returns:

Type	Description
`DataFrame`	pd.DataFrame: A dataframe with one paper per row.

Source code in paperscraper/xrxiv/xrxiv_query.py

def search_keywords(
    self,
    keywords: List[Union[str, List[str]]],
    fields: List[str] = None,
    output_filepath: str = None,
) -> pd.DataFrame:
    """
    Search for papers in the dump using keywords.

    A paper is a hit if at least one searched field satisfies every keyword
    item. Matching is case-insensitive and is done with
    ``Series.str.contains``; synonyms are joined with ``|``, so keywords are
    treated as regular expressions.

    Args:
        keywords (List[Union[str, List[str]]]): Items will be AND separated.
            If items are lists themselves, they will be OR separated.
        fields (List[str], optional): fields to be used in the query search.
            Defaults to None, a.k.a. search in all fields excluding date.
        output_filepath (str, optional): optional output filepath where to store
            the hits in JSONL format. Defaults to None, a.k.a., no export to a file.

    Returns:
        pd.DataFrame: A dataframe with one paper per row. Empty if there are
            no keywords or no searchable fields.
    """
    if fields is None:
        fields = self.fields
    fields = [field for field in fields if field != "date"]

    # Start from "nothing matches" so an empty keyword list or an empty
    # field list gives an empty result instead of an unbound variable.
    hits = pd.Series(False, index=self.df.index, dtype=bool)
    if keywords:
        for field in fields:
            field_data = self.df[field].str.lower()
            field_hits = pd.Series(True, index=self.df.index, dtype=bool)
            for keyword in keywords:
                if isinstance(keyword, list):
                    query = "|".join(synonym.lower() for synonym in keyword)
                else:
                    query = keyword.lower()
                # na=False: a missing value in the field counts as a
                # non-match, keeping the mask free of NaN so it can always
                # be used to index the dataframe.
                field_hits = field_hits & field_data.str.contains(query, na=False)
            hits = hits | field_hits

    papers = self.df[hits]
    if output_filepath is not None:
        papers.to_json(output_filepath, orient="records", lines=True)
    return papers

`dump_papers(papers: pd.DataFrame, filepath: str) -> None` ¶

Receives a pd.DataFrame, one paper per row and dumps it into a .jsonl file with one paper per line.

Parameters:

Name	Type	Description	Default
`papers`	`DataFrame`	A dataframe of paper metadata, one paper per row.	required
`filepath`	`str`	Path to dump the papers, has to end with `.jsonl`.	required

Source code in paperscraper/utils.py

def dump_papers(papers: pd.DataFrame, filepath: str) -> None:
    """
    Receives a pd.DataFrame, one paper per row and dumps it into a .jsonl
    file with one paper per line.

    Args:
        papers (pd.DataFrame): A dataframe of paper metadata, one paper per row.
            A list of dictionaries is still accepted (legacy) and converted
            to a dataframe with a warning.
        filepath (str): Path to dump the papers, has to end with `.jsonl`.

    Raises:
        TypeError: If `filepath` is not a string, or `papers` is neither a
            pd.DataFrame nor a list of dictionaries.
        ValueError: If `filepath` does not end with `.jsonl`.
    """
    if not isinstance(filepath, str):
        raise TypeError(f"filepath must be a string, not {type(filepath)}")
    if not filepath.endswith(".jsonl"):
        raise ValueError("Please provide a filepath with .jsonl extension")

    if isinstance(papers, list) and all(isinstance(p, dict) for p in papers):
        papers = pd.DataFrame(papers)
        logger.warning(
            "Preferably pass a pd.DataFrame, not a list of dictionaries. "
            "Passing a list is a legacy functionality that might become deprecated."
        )

    if not isinstance(papers, pd.DataFrame):
        raise TypeError(f"papers must be a pd.DataFrame, not {type(papers)}")

    # One dict per row, in row order. Going through `papers.T.to_dict()`
    # instead keys the rows by index label, which silently drops rows whenever
    # the index contains duplicates (e.g. after concatenating result frames).
    records = papers.to_dict(orient="records")

    with open(filepath, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(record) + "\n" for record in records)

`get_server_dumps_dir() -> str` ¶

Return the filesystem path to the bundled server_dumps directory.

Source code in paperscraper/utils.py

def get_server_dumps_dir() -> str:
    """Return the path of the `server_dumps` directory inside the `paperscraper` package, as a string."""
    package_root = resources.files("paperscraper")
    dumps_dir = package_root.joinpath("server_dumps")
    return str(dumps_dir)

`get_query_from_keywords(keywords: List[Union[str, List[str]]], start_date: str = 'None', end_date: str = 'None') -> str` ¶

Receives a list of keywords and returns the query for the arxiv API.

Parameters:

Name	Type	Description	Default
`keywords`	`List[Union[str, List[str]]]`	Items will be AND separated. If items are lists themselves, they will be OR separated.	required
`start_date`	`str`	Start date for the search. Needs to be in format: YYYY-MM-DD, e.g. '2020-07-20'. Defaults to 'None', i.e. no specific dates are used.	`'None'`
`end_date`	`str`	End date for the search. Same notation as start_date.	`'None'`

Returns:

Name	Type	Description
`str`	`str`	query to enter to arxiv API.

Source code in paperscraper/arxiv/utils.py

def get_query_from_keywords(
    keywords: List[Union[str, List[str]]],
    start_date: str = "None",
    end_date: str = "None",
) -> str:
    """Build the arxiv API query string for a list of keywords.

    Args:
        keywords (List[Union[str, List[str]]]): Items will be AND separated.
            If items are lists themselves, they will be OR separated. Items
            that are neither a string nor a list are ignored.
        start_date (str): Start date for the search. Needs to be in format:
            YYYY-MM-DD, e.g. '2020-07-20'. Defaults to 'None', i.e. no specific
            dates are used. If only `end_date` is given, `EARLIEST_START` is
            used as the start.
        end_date (str): End date for the search. Same notation as start_date.
            If only `start_date` is given, today's date is used as the end.

    Returns:
        str: query to enter to arxiv API.
    """
    clauses = []
    for key in keywords:
        if isinstance(key, str):
            clauses.append(f"all:{key} AND ")
        elif isinstance(key, list):
            synonyms = "".join(f"all:{syn} OR " for syn in key)
            clauses.append(finalize_disjunction(synonyms))

    # The helper tidies up the dangling connector left by the last clause.
    query = finalize_conjunction("".join(clauses))

    no_start = start_date == "None"
    no_end = end_date == "None"
    if no_start and no_end:
        # No date restriction requested.
        return query

    # Exactly one bound may be missing here; fill it with a default.
    if no_start:
        start_date = EARLIEST_START
    elif no_end:
        end_date = datetime.now().strftime("%Y-%m-%d")

    start = format_date(start_date)
    end = format_date(end_date)
    date_filter = f" AND submittedDate:[{start} TO {end}]"
    return query + date_filter

`get_arxiv_papers_local(keywords: List[Union[str, List[str]]], fields: List[str] = None, output_filepath: str = None) -> pd.DataFrame` ¶

Search for papers in the dump using keywords.

Parameters:

Name	Type	Description	Default
`keywords`	`List[Union[str, List[str]]]`	Items will be AND separated. If items are lists themselves, they will be OR separated.	required
`fields`	`List[str]`	fields to be used in the query search. Defaults to None, a.k.a. search in all fields excluding date.	`None`
`output_filepath`	`str`	optional output filepath where to store the hits in JSONL format. Defaults to None, a.k.a., no export to a file.	`None`

Returns:

Type	Description
`DataFrame`	pd.DataFrame: A dataframe with one paper per row.

Source code in paperscraper/arxiv/arxiv.py

def get_arxiv_papers_local(
    keywords: List[Union[str, List[str]]],
    fields: List[str] = None,
    output_filepath: str = None,
) -> pd.DataFrame:
    """
    Search the local arxiv dump for papers matching the keywords.

    Calls `search_local_arxiv()` first and then delegates the search to the
    module-level `ARXIV_QUERIER`.

    Args:
        keywords: Items will be AND separated. If items
            are lists themselves, they will be OR separated.
        fields: fields to be used in the query search.
            Defaults to None, a.k.a. search in all fields excluding date.
        output_filepath: optional output filepath where to store the hits in JSONL format.
            Defaults to None, a.k.a., no export to a file.

    Returns:
        pd.DataFrame: A dataframe with one paper per row.

    Raises:
        ValueError: If `ARXIV_QUERIER` is still None after `search_local_arxiv()`.
    """
    search_local_arxiv()
    if ARXIV_QUERIER is not None:
        return ARXIV_QUERIER(
            keywords=keywords,
            fields=fields,
            output_filepath=output_filepath,
        )
    raise ValueError(
        "Could not find local arxiv dump. Use `backend=api` or download dump via `paperscraper.get_dumps.arxiv"
    )

`get_arxiv_papers_api(query: str, fields: List = ['title', 'authors', 'date', 'abstract', 'journal', 'doi'], max_results: int = 99999, client_options: Dict = {'num_retries': 10}, search_options: Dict = dict(), verbose: bool = True) -> pd.DataFrame` ¶

Performs arxiv API request of a given query and returns list of papers with fields as desired.

Parameters:

Name	Type	Description	Default
`query`	`str`	Query to arxiv API. Needs to match the arxiv API notation.	required
`fields`	`List`	List of strings with fields to keep in output.	`['title', 'authors', 'date', 'abstract', 'journal', 'doi']`
`max_results`	`int`	Maximal number of results, defaults to 99999.	`99999`
`client_options`	`Dict`	Optional arguments for `arxiv.Client`. E.g.: page_size (int), delay_seconds (int), num_retries (int). NOTE: Decreasing 'num_retries' will speed up processing but might result in more frequent 'UnexpectedEmptyPageErrors'.	`{'num_retries': 10}`
`search_options`	`Dict`	Optional arguments for `arxiv.Search`. E.g.: id_list (List), sort_by, or sort_order.	`dict()`

Returns:

Type	Description
`DataFrame`	pd.DataFrame: One row per paper.

Source code in paperscraper/arxiv/arxiv.py

def get_arxiv_papers_api(
    query: str,
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    max_results: int = 99999,
    client_options: Dict = {"num_retries": 10},
    search_options: Dict = dict(),
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Run a query against the arxiv API and collect the requested fields of
    every returned paper.

    Each attribute of a result object is renamed through `arxiv_field_mapper`
    (unmapped names are kept) and, if `process_fields` has an entry for the
    renamed field, passed through that function. An attribute whose original
    name is `doi` is always skipped.

    Args:
        query: Query to arxiv API. Needs to match the arxiv API notation.
        fields: List of strings with fields to keep in output.
        max_results: Maximal number of results, defaults to 99999.
        client_options: Optional arguments for `arxiv.Client`. E.g.:
            page_size (int), delay_seconds (int), num_retries (int).
            NOTE: Decreasing 'num_retries' will speed up processing but might
            result in more frequent 'UnexpectedEmptyPageErrors'.
        search_options: Optional arguments for `arxiv.Search`. E.g.:
            id_list (List), sort_by, or sort_order.
        verbose: If False, the progress bar is disabled.

    Returns:
        pd.DataFrame: One row per paper.

    """
    client = arxiv.Client(**client_options)
    search = arxiv.Search(query=query, max_results=max_results, **search_options)

    def _identity(value):
        # Fallback for fields without a dedicated processing function.
        return value

    rows = []
    progress = tqdm(
        client.results(search), desc=f"Processing {query}", disable=not verbose
    )
    for paper in progress:
        row = {}
        for key, value in vars(paper).items():
            name = arxiv_field_mapper.get(key, key)
            if name not in fields or key == "doi":
                continue
            row[name] = process_fields.get(name, _identity)(value)
        rows.append(row)
    return pd.DataFrame(rows)

`get_and_dump_arxiv_papers(keywords: List[Union[str, List[str]]], output_filepath: str, fields: List = ['title', 'authors', 'date', 'abstract', 'journal', 'doi'], start_date: str = 'None', end_date: str = 'None', backend: Literal['api', 'local', 'infer'] = 'api', *args: object, **kwargs: object)` ¶

Queries arXiv via `get_arxiv_papers_api` or `get_arxiv_papers_local` (depending on `backend`) and stores the result with `dump_papers`.

Parameters:

Name	Type	Description	Default
`keywords`	`List[Union[str, List[str]]]`	List of keywords for arxiv search. The outer list level will be considered as AND separated keys, the inner level as OR separated.	required
`output_filepath`	`str`	Path where the dump will be saved.	required
`fields`	`List`	List of strings with fields to keep in output. Defaults to ['title', 'authors', 'date', 'abstract', 'journal', 'doi'].	`['title', 'authors', 'date', 'abstract', 'journal', 'doi']`
`start_date`	`str`	Start date for the search. Needs to be in format: YYYY-MM-DD, e.g. '2020-07-20' (the format documented by `get_query_from_keywords`). Defaults to 'None', i.e. no specific dates are used.	`'None'`
`end_date`	`str`	End date for the search. Same notation as start_date.	`'None'`
`backend`	`Literal['api', 'local', 'infer']`	If `api`, the arXiv API is queried. If `local`, the local arXiv dump is queried (it has to be downloaded beforehand). If `infer`, the local dump will be used if it exists, otherwise the API will be queried. Defaults to `api` since it is faster.	`'api'`
`*args`	`object`	Additional positional arguments for `get_arxiv_papers`.	`()`
`**kwargs`	`object`	Additional keyword arguments for `get_arxiv_papers`.	`{}`

Source code in paperscraper/arxiv/arxiv.py

def get_and_dump_arxiv_papers(
    keywords: List[Union[str, List[str]]],
    output_filepath: str,
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    start_date: str = "None",
    end_date: str = "None",
    backend: Literal["api", "local", "infer"] = "api",
    *args: object,
    **kwargs: object,
):
    """
    Fetch arxiv papers for the given keywords and dump them to a file.

    The papers come from `get_arxiv_papers_api` or `get_arxiv_papers_local`,
    depending on `backend`, and are written with `dump_papers`.

    Args:
        keywords: List of keywords for arxiv search.
            The outer list level will be considered as AND separated keys, the
            inner level as OR separated.
        output_filepath: Path where the dump will be saved.
        fields: List of strings with fields to keep in output.
            Defaults to ['title', 'authors', 'date', 'abstract',
            'journal', 'doi'].
        start_date: Start date for the search. Forwarded to
            `get_query_from_keywords`, which documents the format YYYY-MM-DD,
            e.g. '2020-07-20'. Defaults to 'None', i.e. no specific dates are
            used. The dates only enter the API query; the local backend is
            called with `keywords` and `fields` only.
        end_date: End date for the search. Same notation as start_date.
        backend: If `api`, the arXiv API is queried. If `local` the local arXiv dump
            is queried (has to be downloaded before). If `infer` the backend is
            chosen by `infer_backend()`. Defaults to `api` since it is faster.
        *args: Additional positional arguments for `get_arxiv_papers_api` or
            `get_arxiv_papers_local`.
        **kwargs: Additional keyword arguments for `get_arxiv_papers_api` or
            `get_arxiv_papers_local`.

    Raises:
        ValueError: If `backend` is not one of 'api', 'local' or 'infer'.
    """
    # The API query is built up front, before the backend is validated.
    query = get_query_from_keywords(keywords, start_date=start_date, end_date=end_date)

    known_backends = {"api", "local", "infer"}
    if backend not in known_backends:
        raise ValueError(
            f"Invalid backend: {backend}. Must be one of ['api', 'local', 'infer']"
        )
    if backend == "infer":
        backend = infer_backend()

    if backend == "api":
        papers = get_arxiv_papers_api(query, fields, *args, **kwargs)
    elif backend == "local":
        papers = get_arxiv_papers_local(keywords, fields, *args, **kwargs)
    dump_papers(papers, output_filepath)

`arxiv` ¶

`get_arxiv_papers_local(keywords: List[Union[str, List[str]]], fields: List[str] = None, output_filepath: str = None) -> pd.DataFrame` ¶

Search for papers in the dump using keywords.

Parameters:

Name	Type	Description	Default
`keywords`	`List[Union[str, List[str]]]`	Items will be AND separated. If items are lists themselves, they will be OR separated.	required
`fields`	`List[str]`	fields to be used in the query search. Defaults to None, a.k.a. search in all fields excluding date.	`None`
`output_filepath`	`str`	optional output filepath where to store the hits in JSONL format. Defaults to None, a.k.a., no export to a file.	`None`

Returns:

Type	Description
`DataFrame`	pd.DataFrame: A dataframe with one paper per row.

Source code in paperscraper/arxiv/arxiv.py

def get_arxiv_papers_local(
    keywords: List[Union[str, List[str]]],
    fields: List[str] = None,
    output_filepath: str = None,
) -> pd.DataFrame:
    """
    Search the local arxiv dump for papers matching the keywords.

    Calls `search_local_arxiv()` first and then delegates the search to the
    module-level `ARXIV_QUERIER`.

    Args:
        keywords: Items will be AND separated. If items
            are lists themselves, they will be OR separated.
        fields: fields to be used in the query search.
            Defaults to None, a.k.a. search in all fields excluding date.
        output_filepath: optional output filepath where to store the hits in JSONL format.
            Defaults to None, a.k.a., no export to a file.

    Returns:
        pd.DataFrame: A dataframe with one paper per row.

    Raises:
        ValueError: If `ARXIV_QUERIER` is still None after `search_local_arxiv()`.
    """
    search_local_arxiv()
    if ARXIV_QUERIER is not None:
        return ARXIV_QUERIER(
            keywords=keywords,
            fields=fields,
            output_filepath=output_filepath,
        )
    raise ValueError(
        "Could not find local arxiv dump. Use `backend=api` or download dump via `paperscraper.get_dumps.arxiv"
    )

`get_arxiv_papers_api(query: str, fields: List = ['title', 'authors', 'date', 'abstract', 'journal', 'doi'], max_results: int = 99999, client_options: Dict = {'num_retries': 10}, search_options: Dict = dict(), verbose: bool = True) -> pd.DataFrame` ¶

Performs arxiv API request of a given query and returns list of papers with fields as desired.

Parameters:

Name	Type	Description	Default
`query`	`str`	Query to arxiv API. Needs to match the arxiv API notation.	required
`fields`	`List`	List of strings with fields to keep in output.	`['title', 'authors', 'date', 'abstract', 'journal', 'doi']`
`max_results`	`int`	Maximal number of results, defaults to 99999.	`99999`
`client_options`	`Dict`	Optional arguments for `arxiv.Client`. E.g.: page_size (int), delay_seconds (int), num_retries (int). NOTE: Decreasing 'num_retries' will speed up processing but might result in more frequent 'UnexpectedEmptyPageErrors'.	`{'num_retries': 10}`
`search_options`	`Dict`	Optional arguments for `arxiv.Search`. E.g.: id_list (List), sort_by, or sort_order.	`dict()`

Returns:

Type	Description
`DataFrame`	pd.DataFrame: One row per paper.

Source code in paperscraper/arxiv/arxiv.py

def get_arxiv_papers_api(
    query: str,
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    max_results: int = 99999,
    client_options: Dict = {"num_retries": 10},
    search_options: Dict = dict(),
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Run a query against the arxiv API and collect the requested fields of
    every returned paper.

    Each attribute of a result object is renamed through `arxiv_field_mapper`
    (unmapped names are kept) and, if `process_fields` has an entry for the
    renamed field, passed through that function. An attribute whose original
    name is `doi` is always skipped.

    Args:
        query: Query to arxiv API. Needs to match the arxiv API notation.
        fields: List of strings with fields to keep in output.
        max_results: Maximal number of results, defaults to 99999.
        client_options: Optional arguments for `arxiv.Client`. E.g.:
            page_size (int), delay_seconds (int), num_retries (int).
            NOTE: Decreasing 'num_retries' will speed up processing but might
            result in more frequent 'UnexpectedEmptyPageErrors'.
        search_options: Optional arguments for `arxiv.Search`. E.g.:
            id_list (List), sort_by, or sort_order.
        verbose: If False, the progress bar is disabled.

    Returns:
        pd.DataFrame: One row per paper.

    """
    client = arxiv.Client(**client_options)
    search = arxiv.Search(query=query, max_results=max_results, **search_options)

    def _identity(value):
        # Fallback for fields without a dedicated processing function.
        return value

    rows = []
    progress = tqdm(
        client.results(search), desc=f"Processing {query}", disable=not verbose
    )
    for paper in progress:
        row = {}
        for key, value in vars(paper).items():
            name = arxiv_field_mapper.get(key, key)
            if name not in fields or key == "doi":
                continue
            row[name] = process_fields.get(name, _identity)(value)
        rows.append(row)
    return pd.DataFrame(rows)

`get_and_dump_arxiv_papers(keywords: List[Union[str, List[str]]], output_filepath: str, fields: List = ['title', 'authors', 'date', 'abstract', 'journal', 'doi'], start_date: str = 'None', end_date: str = 'None', backend: Literal['api', 'local', 'infer'] = 'api', *args: object, **kwargs: object)` ¶

Queries arXiv via `get_arxiv_papers_api` or `get_arxiv_papers_local` (depending on `backend`) and stores the result with `dump_papers`.

Parameters:

Name	Type	Description	Default
`keywords`	`List[Union[str, List[str]]]`	List of keywords for arxiv search. The outer list level will be considered as AND separated keys, the inner level as OR separated.	required
`output_filepath`	`str`	Path where the dump will be saved.	required
`fields`	`List`	List of strings with fields to keep in output. Defaults to ['title', 'authors', 'date', 'abstract', 'journal', 'doi'].	`['title', 'authors', 'date', 'abstract', 'journal', 'doi']`
`start_date`	`str`	Start date for the search. Needs to be in format: YYYY-MM-DD, e.g. '2020-07-20' (the format documented by `get_query_from_keywords`). Defaults to 'None', i.e. no specific dates are used.	`'None'`
`end_date`	`str`	End date for the search. Same notation as start_date.	`'None'`
`backend`	`Literal['api', 'local', 'infer']`	If `api`, the arXiv API is queried. If `local`, the local arXiv dump is queried (it has to be downloaded beforehand). If `infer`, the local dump will be used if it exists, otherwise the API will be queried. Defaults to `api` since it is faster.	`'api'`
`*args`	`object`	Additional positional arguments for `get_arxiv_papers`.	`()`
`**kwargs`	`object`	Additional keyword arguments for `get_arxiv_papers`.	`{}`

Source code in paperscraper/arxiv/arxiv.py

def get_and_dump_arxiv_papers(
    keywords: List[Union[str, List[str]]],
    output_filepath: str,
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    start_date: str = "None",
    end_date: str = "None",
    backend: Literal["api", "local", "infer"] = "api",
    *args: object,
    **kwargs: object,
):
    """
    Fetch arxiv papers for the given keywords and dump them to a file.

    The papers come from `get_arxiv_papers_api` or `get_arxiv_papers_local`,
    depending on `backend`, and are written with `dump_papers`.

    Args:
        keywords: List of keywords for arxiv search.
            The outer list level will be considered as AND separated keys, the
            inner level as OR separated.
        output_filepath: Path where the dump will be saved.
        fields: List of strings with fields to keep in output.
            Defaults to ['title', 'authors', 'date', 'abstract',
            'journal', 'doi'].
        start_date: Start date for the search. Forwarded to
            `get_query_from_keywords`, which documents the format YYYY-MM-DD,
            e.g. '2020-07-20'. Defaults to 'None', i.e. no specific dates are
            used. The dates only enter the API query; the local backend is
            called with `keywords` and `fields` only.
        end_date: End date for the search. Same notation as start_date.
        backend: If `api`, the arXiv API is queried. If `local` the local arXiv dump
            is queried (has to be downloaded before). If `infer` the backend is
            chosen by `infer_backend()`. Defaults to `api` since it is faster.
        *args: Additional positional arguments for `get_arxiv_papers_api` or
            `get_arxiv_papers_local`.
        **kwargs: Additional keyword arguments for `get_arxiv_papers_api` or
            `get_arxiv_papers_local`.

    Raises:
        ValueError: If `backend` is not one of 'api', 'local' or 'infer'.
    """
    # The API query is built up front, before the backend is validated.
    query = get_query_from_keywords(keywords, start_date=start_date, end_date=end_date)

    known_backends = {"api", "local", "infer"}
    if backend not in known_backends:
        raise ValueError(
            f"Invalid backend: {backend}. Must be one of ['api', 'local', 'infer']"
        )
    if backend == "infer":
        backend = infer_backend()

    if backend == "api":
        papers = get_arxiv_papers_api(query, fields, *args, **kwargs)
    elif backend == "local":
        papers = get_arxiv_papers_local(keywords, fields, *args, **kwargs)
    dump_papers(papers, output_filepath)

`kaggle` ¶

Kaggle-backed arXiv metadata dumping utilities.

`arxiv_kaggle(start_date: datetime, end_date: datetime, save_path: str, kaggle_filepath: Optional[str] = None) -> int` ¶

Convert a Kaggle arXiv metadata snapshot to paperscraper JSONL format.

Parameters:

Name	Type	Description	Default
`start_date`	`datetime`	Earliest paper submission date to include.	required
`end_date`	`datetime`	Latest paper submission date to include.	required
`save_path`	`str`	Destination JSONL path for converted papers.	required
`kaggle_filepath`	`Optional[str]`	Existing Kaggle snapshot file. If provided, no Kaggle download is attempted.	`None`

Returns:

Type	Description
`int`	Number of papers written to `save_path`.

Source code in paperscraper/arxiv/kaggle.py

def arxiv_kaggle(
    start_date: datetime,
    end_date: datetime,
    save_path: str,
    kaggle_filepath: Optional[str] = None,
) -> int:
    """Convert a Kaggle arXiv metadata snapshot to paperscraper JSONL format.

    The snapshot is read line by line. Blank lines are skipped, as are
    records whose date (from `get_kaggle_paper_date`) is missing or outside
    `[start_date, end_date]`. The remaining records are written one JSON
    document per line, separated by newlines, with no trailing newline.

    If `kaggle_filepath` is None the snapshot is obtained through
    `download_kaggle_snapshot()` and the default Kaggle directory is removed
    once conversion finishes, whether it succeeded or failed.

    Args:
        start_date: Earliest paper submission date to include.
        end_date: Latest paper submission date to include.
        save_path: Destination JSONL path for converted papers. Missing parent
            directories are created.
        kaggle_filepath: Existing Kaggle snapshot file. If provided, no Kaggle
            download is attempted and nothing is cleaned up.

    Returns:
        Number of papers written to `save_path`.
    """
    # Only a directory this call populated itself is cleaned up afterwards.
    cleanup_dir = default_kaggle_dir() if kaggle_filepath is None else None
    if kaggle_filepath is None:
        kaggle_filepath = download_kaggle_snapshot()

    try:
        written = 0
        os.makedirs(os.path.dirname(os.path.abspath(save_path)), exist_ok=True)
        total_size = os.path.getsize(kaggle_filepath)
        with (
            open(kaggle_filepath, "r", encoding="utf-8") as in_fp,
            open(save_path, "w", encoding="utf-8") as out_fp,
            tqdm(
                total=total_size,
                desc="Converting arXiv Kaggle snapshot",
                unit="B",
                unit_scale=True,
            ) as progress_bar,
        ):
            for line in in_fp:
                # Progress is tracked in bytes to match `total_size`.
                progress_bar.update(len(line.encode("utf-8")))
                if not line.strip():
                    continue

                record = json.loads(line)
                paper_date = get_kaggle_paper_date(record)
                if paper_date is None or not start_date <= paper_date <= end_date:
                    continue

                if written > 0:
                    # Text-mode files translate "\n" to the platform line
                    # ending themselves; writing os.linesep here would yield
                    # "\r\r\n" on Windows.
                    out_fp.write("\n")
                out_fp.write(json.dumps(normalize_kaggle_record(record, paper_date)))
                written += 1
        return written
    finally:
        if cleanup_dir is not None:
            shutil.rmtree(cleanup_dir, ignore_errors=True)

`download_kaggle_snapshot() -> str` ¶

Download the Kaggle arXiv metadata snapshot if needed.

Returns:

Type	Description
`str`	Path to the local Kaggle snapshot JSON file.

Raises:

Type	Description
`ImportError`	If the `kaggle` package is not installed.
`RuntimeError`	If Kaggle authentication is missing or invalid.
`FileNotFoundError`	If the download succeeds but no snapshot JSON is found.

Source code in paperscraper/arxiv/kaggle.py

def download_kaggle_snapshot() -> str:
    """Return a local copy of the Kaggle arXiv metadata snapshot.

    A snapshot already present in the default Kaggle directory is reused;
    otherwise the dataset is downloaded and unzipped there via the Kaggle API.

    Returns:
        Path to the local Kaggle snapshot JSON file.

    Raises:
        ImportError: If the `kaggle` package is not installed.
        RuntimeError: If Kaggle authentication is missing or invalid.
        FileNotFoundError: If the download succeeds but no snapshot JSON is found.
    """
    # Same message for both authentication failure paths below.
    auth_error = (
        "Kaggle authentication is required for the arXiv Kaggle backend. "
        "Run `kaggle auth login` or configure Kaggle credentials."
    )

    target_dir = default_kaggle_dir()
    os.makedirs(target_dir, exist_ok=True)

    cached = find_kaggle_snapshot(target_dir)
    if cached:
        return cached

    # Imported only after the cache check, so a reused snapshot never needs
    # the `kaggle` package at all.
    try:
        from kaggle.api.kaggle_api_extended import KaggleApi
    except ImportError as exc:
        raise ImportError(
            "The Kaggle backend requires the `kaggle` package. Install it with "
            "`pip install kaggle` or `uv add kaggle`."
        ) from exc
    except SystemExit as exc:
        # NOTE(review): presumably some kaggle releases exit during import when
        # credentials are missing - confirm against the supported versions.
        raise RuntimeError(auth_error) from exc

    client = KaggleApi()
    try:
        client.authenticate()
    except Exception as exc:
        raise RuntimeError(auth_error) from exc

    client.dataset_download_files(
        DEFAULT_KAGGLE_DATASET,
        path=target_dir,
        unzip=True,
        quiet=False,
    )

    downloaded = find_kaggle_snapshot(target_dir)
    if downloaded is not None:
        return downloaded
    raise FileNotFoundError(f"No arXiv Kaggle snapshot found in {target_dir}")

`default_kaggle_dir() -> str` ¶

Return the default temporary directory for Kaggle arXiv downloads.

Returns:

Type	Description
`str`	Path to the default Kaggle download directory.

Source code in paperscraper/arxiv/kaggle.py

def default_kaggle_dir() -> str:
    """Build the path of the directory used for Kaggle arXiv downloads.

    Returns:
        Path of the `arxiv_kaggle` subdirectory inside the server dumps
        directory.
    """
    dumps_root = get_server_dumps_dir()
    return os.path.join(dumps_root, "arxiv_kaggle")

`find_kaggle_snapshot(kaggle_dir: str) -> Optional[str]` ¶

Find the arXiv metadata snapshot JSON in a Kaggle download directory.

Parameters:

Name	Type	Description	Default
`kaggle_dir`	`str`	Directory to search.	required

Returns:

Type	Description
`Optional[str]`	Path to the largest candidate JSON file, or None if no candidate exists.

Source code in paperscraper/arxiv/kaggle.py

def find_kaggle_snapshot(kaggle_dir: str) -> Optional[str]:
    """Locate the arXiv metadata snapshot JSON inside a Kaggle download directory.

    Args:
        kaggle_dir: Directory to search (not recursive).

    Returns:
        Path to the largest regular `*.json` file in the directory, or None
        when the directory holds no such file.
    """
    # The specific pattern is globbed first so that, on a size tie, an
    # `arxiv-metadata*` file is encountered (and therefore chosen) first.
    matches = []
    for pattern in ("arxiv-metadata*.json", "*.json"):
        matches.extend(glob.glob(os.path.join(kaggle_dir, pattern)))

    # Drop directories (or other non-files) that happen to end in `.json`.
    files = [match for match in matches if os.path.isfile(match)]
    if not files:
        return None

    # `max` returns the first maximal element, matching a stable descending sort.
    return max(files, key=os.path.getsize)

`get_kaggle_paper_date(record: dict) -> Optional[datetime]` ¶

Extract the first submission date from a Kaggle arXiv record.

Parameters:

Name	Type	Description	Default
`record`	`dict`	Raw Kaggle arXiv metadata record.	required

Returns:

Type	Description
`Optional[datetime]`	Naive UTC-normalized submission date at midnight, or None if no usable date is available.

Source code in paperscraper/arxiv/kaggle.py

def get_kaggle_paper_date(record: dict) -> Optional[datetime]:
    """Extract the first submission date from a Kaggle arXiv record.

    The `created` timestamp of the first non-empty entry in `versions` is
    preferred. When it is missing or cannot be parsed, `update_date`
    (`YYYY-MM-DD`) is used instead. Malformed fields never raise; they simply
    make the function fall through to the next source.

    Args:
        record: Raw Kaggle arXiv metadata record.

    Returns:
        Naive UTC-normalized submission date at midnight, or None if no usable
        date is available.
    """
    # `versions` may be absent, null, or not a list at all in malformed records;
    # treat every such case as "no version history" instead of crashing.
    versions = record.get("versions")
    if not isinstance(versions, (list, tuple)):
        versions = []

    # Only the first non-empty dict entry is consulted.
    created = next(
        (
            version.get("created")
            for version in versions
            if version and isinstance(version, dict)
        ),
        None,
    )
    if created and isinstance(created, str):
        try:
            date = parsedate_to_datetime(created)
            if date.tzinfo is not None:
                # Normalize to UTC before dropping the timezone so the calendar
                # day is the UTC day, not the submitter's local day.
                date = date.astimezone(timezone.utc).replace(tzinfo=None)
            return date.replace(hour=0, minute=0, second=0, microsecond=0)
        except (TypeError, ValueError):
            # Unparseable timestamp: fall back to `update_date` below.
            pass

    update_date = record.get("update_date")
    if update_date:
        try:
            return datetime.strptime(update_date, "%Y-%m-%d")
        except (TypeError, ValueError):
            # TypeError covers a non-string value, ValueError a bad format.
            return None
    return None

`normalize_kaggle_record(record: dict, paper_date: datetime) -> dict` ¶

Normalize a Kaggle arXiv record to paperscraper dump fields.

Parameters:

Name	Type	Description	Default
`record`	`dict`	Raw Kaggle arXiv metadata record.	required
`paper_date`	`datetime`	Submission date returned by `get_kaggle_paper_date`.	required

Returns:

Type	Description
`dict`	Dictionary with paperscraper's standard `title`, `authors`, `date`, `abstract`, `journal`, and `doi` fields.

Source code in paperscraper/arxiv/kaggle.py

def normalize_kaggle_record(record: dict, paper_date: datetime) -> dict:
    """Normalize a Kaggle arXiv record to paperscraper dump fields.

    Args:
        record: Raw Kaggle arXiv metadata record.
        paper_date: Submission date returned by `get_kaggle_paper_date`.

    Returns:
        Dictionary with paperscraper's standard `title`, `authors`, `date`,
        `abstract`, `journal`, and `doi` fields. When the record carries no
        DOI, an arXiv DOI is derived from the record's identifier.
    """
    raw_id = str(record.get("id", ""))
    # Strip only a trailing version suffix (`v<digits>`). Splitting on the first
    # "v" would truncate old-style identifiers whose archive name contains a
    # "v", e.g. "solv-int/9901001" would become "sol".
    base, marker, version = raw_id.rpartition("v")
    arxiv_id = base if marker and base and version.isdigit() else raw_id
    return {
        "title": normalize_whitespace(record.get("title", "")),
        "authors": normalize_whitespace(record.get("authors", "")),
        "date": paper_date.strftime("%Y-%m-%d"),
        "abstract": normalize_whitespace(record.get("abstract", "")),
        "journal": normalize_whitespace(record.get("journal-ref", "")),
        # Fall back to the arXiv-issued DOI when the record has none.
        "doi": record.get("doi") or f"10.48550/arXiv.{arxiv_id}",
    }

`utils` ¶

`format_date(date_str: str) -> str` ¶

Converts a date in YYYY-MM-DD format to arXiv's YYYYMMDDTTTT format.

Source code in paperscraper/arxiv/utils.py

def format_date(date_str: str) -> str:
    """Convert a YYYY-MM-DD date into arXiv's YYYYMMDDTTTT form, with the time fixed to 0000."""
    parsed = datetime.strptime(date_str, "%Y-%m-%d")
    return f"{parsed:%Y%m%d}0000"

`get_query_from_keywords(keywords: List[Union[str, List[str]]], start_date: str = 'None', end_date: str = 'None') -> str` ¶

Receives a list of keywords and returns the query for the arxiv API.

Parameters:

Name	Type	Description	Default
`keywords`	`List[str, List[str]]`	Items will be AND separated. If items are lists themselves, they will be OR separated.	required
`start_date`	`str`	Start date for the search. Needs to be in format: YYYY-MM-DD, e.g. '2020-07-20'. Defaults to 'None', i.e. no specific dates are used.	`'None'`
`end_date`	`str`	End date for the search. Same notation as start_date.	`'None'`

Returns:

Name	Type	Description
`str`	`str`	query to enter to arxiv API.

Source code in paperscraper/arxiv/utils.py

def get_query_from_keywords(
    keywords: List[Union[str, List[str]]],
    start_date: str = "None",
    end_date: str = "None",
) -> str:
    """Build the arXiv API query string for a list of keywords.

    Args:
        keywords (List[Union[str, List[str]]]): Top-level items are combined
            with AND. An item that is itself a list is a group of synonyms
            combined with OR. Items of any other type are ignored.
        start_date (str): Start date for the search in YYYY-MM-DD format,
            e.g. '2020-07-20'. Defaults to 'None', i.e. no lower bound given.
        end_date (str): End date for the search. Same notation as start_date.

    Returns:
        str: query to enter to arxiv API. A `submittedDate` range filter is
        appended when at least one of the two dates is given.
    """
    clauses = []
    for key in keywords:
        if isinstance(key, str):
            clauses.append(f"all:{key} AND ")
        elif isinstance(key, list):
            synonyms = "".join(f"all:{syn} OR " for syn in key)
            clauses.append(finalize_disjunction(synonyms))
    query = finalize_conjunction("".join(clauses))

    no_start = start_date == "None"
    no_end = end_date == "None"
    if no_start and no_end:
        return query

    # At most one bound is still missing here; fill it with EARLIEST_START or
    # today's date so the range filter is always closed on both sides.
    if no_start:
        start_date = EARLIEST_START
    elif no_end:
        end_date = datetime.now().strftime("%Y-%m-%d")

    start = format_date(start_date)
    end = format_date(end_date)
    return f"{query} AND submittedDate:[{start} TO {end}]"

paperscraper.arxiv