Skip to content

paperscraper.arxiv

paperscraper.arxiv

XRXivQuery

Query class.

Source code in paperscraper/xrxiv/xrxiv_query.py
class XRXivQuery:
    """Keyword search over a JSONL dump of paper metadata.

    The dump is loaded into a pandas DataFrame once, at construction time, and
    kept on the instance as ``self.df``.
    """

    def __init__(
        self,
        dump_filepath: str,
        fields: List[str] = ["title", "doi", "authors", "abstract", "date", "journal"],
    ):
        """
        Initialize the query class.

        Read problems are logged instead of raised. When that happens
        ``self.errored`` is set to True and ``self.df`` is either missing
        (the file could not be parsed) or lacks the normalized ``date`` column,
        so callers should check ``errored`` before searching.

        Args:
            dump_filepath (str): filepath to the dump to be queried.
            fields (List[str], optional): fields contained in the dump per paper.
                Defaults to ['title', 'doi', 'authors', 'abstract', 'date', 'journal'].
        """
        self.dump_filepath = dump_filepath
        # Copy the list: storing the argument directly would make every
        # instance share (and be able to mutate) the single default list object.
        self.fields = list(fields)
        self.errored = False

        try:
            self.df = pd.read_json(self.dump_filepath, lines=True)
            # Store dates as plain 'YYYY-MM-DD' strings.
            self.df["date"] = [date.strftime("%Y-%m-%d") for date in self.df["date"]]
        except ValueError as e:
            logger.warning(f"Problem in reading file {dump_filepath}: {e} - Skipping!")
            self.errored = True
        except KeyError as e:
            logger.warning(f"Key {e} missing in file from {dump_filepath} - Skipping!")
            self.errored = True

    def search_keywords(
        self,
        keywords: List[Union[str, List[str]]],
        fields: List[str] = None,
        output_filepath: str = None,
    ) -> pd.DataFrame:
        """
        Search for papers in the dump using keywords.

        Matching is case-insensitive. A paper is a hit when at least one
        searched field satisfies *all* keyword items. Keywords are handed to
        ``Series.str.contains`` with its default settings, i.e. they are
        interpreted as regular expressions.

        Args:
            keywords (List[str, List[str]]): Items will be AND separated. If items
                are lists themselves, they will be OR separated.
            fields (List[str], optional): fields to be used in the query search.
                Defaults to None, a.k.a. search in all fields excluding date.
            output_filepath (str, optional): optional output filepath where to store
                the hits in JSONL format. Defaults to None, a.k.a., no export to a file.

        Returns:
            pd.DataFrame: A dataframe with one paper per row. Empty when
                ``keywords`` is empty or no searchable field remains.
        """
        if fields is None:
            fields = self.fields
        fields = [field for field in fields if field != "date"]

        # Start from "nothing matches" so the mask is always defined, even when
        # there is no keyword or no field to search.
        hits = pd.Series(False, index=self.df.index)
        if keywords:
            for field in fields:
                field_data = self.df[field].str.lower()
                field_hits = pd.Series(True, index=self.df.index)
                for keyword in keywords:
                    if isinstance(keyword, list):
                        # Synonyms are OR-ed through regex alternation.
                        query = "|".join(synonym.lower() for synonym in keyword)
                    else:
                        query = keyword.lower()
                    # na=False: a paper missing this field simply does not
                    # match; otherwise NaN leaks into the mask and boolean
                    # indexing of the dataframe fails.
                    field_hits &= field_data.str.contains(query, na=False)
                hits |= field_hits

        result = self.df[hits]
        if output_filepath is not None:
            result.to_json(output_filepath, orient="records", lines=True)
        return result

__init__(dump_filepath: str, fields: List[str] = ['title', 'doi', 'authors', 'abstract', 'date', 'journal'])

Initialize the query class.

Parameters:

Name Type Description Default
dump_filepath str

filepath to the dump to be queried.

required
fields List[str]

fields contained in the dump per paper. Defaults to ['title', 'doi', 'authors', 'abstract', 'date', 'journal'].

['title', 'doi', 'authors', 'abstract', 'date', 'journal']
Source code in paperscraper/xrxiv/xrxiv_query.py
def __init__(
    self,
    dump_filepath: str,
    fields: List[str] = ["title", "doi", "authors", "abstract", "date", "journal"],
):
    """
    Initialize the query class.

    Read problems are logged instead of raised. When that happens
    ``self.errored`` is set to True and ``self.df`` is either missing (the
    file could not be parsed) or lacks the normalized ``date`` column.

    Args:
        dump_filepath (str): filepath to the dump to be queried.
        fields (List[str], optional): fields contained in the dump per paper.
            Defaults to ['title', 'doi', 'authors', 'abstract', 'date', 'journal'].
    """
    self.dump_filepath = dump_filepath
    # Copy the list: storing the argument directly would make every instance
    # share (and be able to mutate) the single default list object.
    self.fields = list(fields)
    self.errored = False

    try:
        self.df = pd.read_json(self.dump_filepath, lines=True)
        # Store dates as plain 'YYYY-MM-DD' strings.
        self.df["date"] = [date.strftime("%Y-%m-%d") for date in self.df["date"]]
    except ValueError as e:
        logger.warning(f"Problem in reading file {dump_filepath}: {e} - Skipping!")
        self.errored = True
    except KeyError as e:
        logger.warning(f"Key {e} missing in file from {dump_filepath} - Skipping!")
        self.errored = True

search_keywords(keywords: List[Union[str, List[str]]], fields: List[str] = None, output_filepath: str = None) -> pd.DataFrame

Search for papers in the dump using keywords.

Parameters:

Name Type Description Default
keywords List[str, List[str]]

Items will be AND separated. If items are lists themselves, they will be OR separated.

required
fields List[str]

fields to be used in the query search. Defaults to None, a.k.a. search in all fields excluding date.

None
output_filepath str

optional output filepath where to store the hits in JSONL format. Defaults to None, a.k.a., no export to a file.

None

Returns:

Type Description
DataFrame

pd.DataFrame: A dataframe with one paper per row.

Source code in paperscraper/xrxiv/xrxiv_query.py
def search_keywords(
    self,
    keywords: List[Union[str, List[str]]],
    fields: List[str] = None,
    output_filepath: str = None,
) -> pd.DataFrame:
    """
    Search for papers in the dump using keywords.

    Matching is case-insensitive. A paper is a hit when at least one searched
    field satisfies *all* keyword items. Keywords are handed to
    ``Series.str.contains`` with its default settings, i.e. they are
    interpreted as regular expressions.

    Args:
        keywords (List[str, List[str]]): Items will be AND separated. If items
            are lists themselves, they will be OR separated.
        fields (List[str], optional): fields to be used in the query search.
            Defaults to None, a.k.a. search in all fields excluding date.
        output_filepath (str, optional): optional output filepath where to store
            the hits in JSONL format. Defaults to None, a.k.a., no export to a file.

    Returns:
        pd.DataFrame: A dataframe with one paper per row. Empty when
            ``keywords`` is empty or no searchable field remains.
    """
    if fields is None:
        fields = self.fields
    fields = [field for field in fields if field != "date"]

    # Start from "nothing matches" so the mask is always defined, even when
    # there is no keyword or no field to search.
    hits = pd.Series(False, index=self.df.index)
    if keywords:
        for field in fields:
            field_data = self.df[field].str.lower()
            field_hits = pd.Series(True, index=self.df.index)
            for keyword in keywords:
                if isinstance(keyword, list):
                    # Synonyms are OR-ed through regex alternation.
                    query = "|".join(synonym.lower() for synonym in keyword)
                else:
                    query = keyword.lower()
                # na=False: a paper missing this field simply does not match;
                # otherwise NaN leaks into the mask and boolean indexing of
                # the dataframe fails.
                field_hits &= field_data.str.contains(query, na=False)
            hits |= field_hits

    result = self.df[hits]
    if output_filepath is not None:
        result.to_json(output_filepath, orient="records", lines=True)
    return result

dump_papers(papers: pd.DataFrame, filepath: str) -> None

Receives a pd.DataFrame, one paper per row and dumps it into a .jsonl file with one paper per line.

Parameters:

Name Type Description Default
papers DataFrame

A dataframe of paper metadata, one paper per row.

required
filepath str

Path to dump the papers, has to end with .jsonl.

required
Source code in paperscraper/utils.py
def dump_papers(papers: pd.DataFrame, filepath: str) -> None:
    """
    Receives a pd.DataFrame, one paper per row and dumps it into a .jsonl
    file with one paper per line.

    Args:
        papers (pd.DataFrame): A dataframe of paper metadata, one paper per row.
            A list of dictionaries is still accepted for backwards
            compatibility and is converted to a dataframe (with a warning).
        filepath (str): Path to dump the papers, has to end with `.jsonl`.

    Raises:
        TypeError: If `filepath` is not a string or `papers` is neither a
            dataframe nor a list of dictionaries.
        ValueError: If `filepath` does not end with `.jsonl`.
    """
    if not isinstance(filepath, str):
        raise TypeError(f"filepath must be a string, not {type(filepath)}")
    if not filepath.endswith(".jsonl"):
        raise ValueError("Please provide a filepath with .jsonl extension")

    if isinstance(papers, List) and all(isinstance(p, Dict) for p in papers):
        papers = pd.DataFrame(papers)
        logger.warning(
            "Preferably pass a pd.DataFrame, not a list of dictionaries. "
            "Passing a list is a legacy functionality that might become deprecated."
        )

    if not isinstance(papers, pd.DataFrame):
        raise TypeError(f"papers must be a pd.DataFrame, not {type(papers)}")

    # Build one dict per row. Going through `papers.T.to_dict()` would key the
    # rows by index label and silently drop rows whose labels repeat (e.g.
    # after concatenating several result frames).
    paper_list = papers.to_dict(orient="records")

    with open(filepath, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(paper) + "\n" for paper in paper_list)

get_server_dumps_dir() -> str

Return the filesystem path to the bundled server_dumps directory.

Source code in paperscraper/utils.py
def get_server_dumps_dir() -> str:
    """Locate the ``server_dumps`` directory inside the ``paperscraper`` package.

    Returns:
        The location of that directory, rendered as a plain string.
    """
    package_root = resources.files("paperscraper")
    dumps_dir = package_root.joinpath("server_dumps")
    return str(dumps_dir)

get_query_from_keywords(keywords: List[Union[str, List[str]]], start_date: str = 'None', end_date: str = 'None') -> str

Receives a list of keywords and returns the query for the arxiv API.

Parameters:

Name Type Description Default
keywords List[str, List[str]]

Items will be AND separated. If items are lists themselves, they will be OR separated.

required
start_date str

Start date for the search. Needs to be in format: YYYY-MM-DD, e.g. '2020-07-20'. Defaults to 'None', i.e. no specific dates are used.

'None'
end_date str

End date for the search. Same notation as start_date.

'None'

Returns:

Name Type Description
str str

query to enter to arxiv API.

Source code in paperscraper/arxiv/utils.py
def get_query_from_keywords(
    keywords: List[Union[str, List[str]]],
    start_date: str = "None",
    end_date: str = "None",
) -> str:
    """Translate a keyword list into a query string for the arxiv API.

    Every term is prefixed with ``all:``. Entries that are neither a string
    nor a list are skipped.

    Args:
        keywords (List[str, List[str]]): Items will be AND separated. If items
            are lists themselves, they will be OR separated.
        start_date (str): Start date for the search. Needs to be in format:
            YYYY-MM-DD, e.g. '2020-07-20'. Defaults to 'None', i.e. no specific
            dates are used. If only an end date is given, `EARLIEST_START` is
            used as start.
        end_date (str): End date for the search. Same notation as start_date.
            If only a start date is given, today's date is used as end.

    Returns:
        str: query to enter to arxiv API.
    """
    pieces = []
    for key in keywords:
        if isinstance(key, str):
            pieces.append(f"all:{key} AND ")
        elif isinstance(key, list):
            alternatives = "".join(f"all:{syn} OR " for syn in key)
            # presumably trims the dangling " OR " and closes the group -
            # helper is defined elsewhere, TODO confirm
            pieces.append(finalize_disjunction(alternatives))
    query = finalize_conjunction("".join(pieces))

    start_missing = start_date == "None"
    end_missing = end_date == "None"
    if start_missing and end_missing:
        # No date restriction requested at all.
        return query
    if start_missing:
        start_date = EARLIEST_START
    elif end_missing:
        end_date = datetime.now().strftime("%Y-%m-%d")

    start = format_date(start_date)
    end = format_date(end_date)
    return query + f" AND submittedDate:[{start} TO {end}]"

get_arxiv_papers_local(keywords: List[Union[str, List[str]]], fields: List[str] = None, output_filepath: str = None) -> pd.DataFrame

Search for papers in the dump using keywords.

Parameters:

Name Type Description Default
keywords List[Union[str, List[str]]]

Items will be AND separated. If items are lists themselves, they will be OR separated.

required
fields List[str]

fields to be used in the query search. Defaults to None, a.k.a. search in all fields excluding date.

None
output_filepath str

optional output filepath where to store the hits in JSONL format. Defaults to None, a.k.a., no export to a file.

None

Returns:

Type Description
DataFrame

pd.DataFrame: A dataframe with one paper per row.

Source code in paperscraper/arxiv/arxiv.py
def get_arxiv_papers_local(
    keywords: List[Union[str, List[str]]],
    fields: List[str] = None,
    output_filepath: str = None,
) -> pd.DataFrame:
    """
    Search for papers in the dump using keywords.

    Args:
        keywords: Items will be AND separated. If items
            are lists themselves, they will be OR separated.
        fields: fields to be used in the query search.
            Defaults to None, a.k.a. search in all fields excluding date.
        output_filepath: optional output filepath where to store the hits in JSONL format.
            Defaults to None, a.k.a., no export to a file.

    Returns:
        pd.DataFrame: A dataframe with one paper per row.

    Raises:
        ValueError: If no local arxiv querier is available after looking for
            the local dump.
    """
    # presumably (re)binds the module-level ARXIV_QUERIER when a local dump
    # exists - defined elsewhere, TODO confirm
    search_local_arxiv()
    if ARXIV_QUERIER is None:
        raise ValueError(
            "Could not find local arxiv dump. Use `backend=api` or download dump via `paperscraper.get_dumps.arxiv`"
        )
    return ARXIV_QUERIER(
        keywords=keywords, fields=fields, output_filepath=output_filepath
    )

get_arxiv_papers_api(query: str, fields: List = ['title', 'authors', 'date', 'abstract', 'journal', 'doi'], max_results: int = 99999, client_options: Dict = {'num_retries': 10}, search_options: Dict = dict(), verbose: bool = True) -> pd.DataFrame

Performs arxiv API request of a given query and returns list of papers with fields as desired.

Parameters:

Name Type Description Default
query str

Query to arxiv API. Needs to match the arxiv API notation.

required
fields List

List of strings with fields to keep in output.

['title', 'authors', 'date', 'abstract', 'journal', 'doi']
max_results int

Maximal number of results, defaults to 99999.

99999
client_options Dict

Optional arguments for arxiv.Client. E.g.: page_size (int), delay_seconds (int), num_retries (int). NOTE: Decreasing 'num_retries' will speed up processing but might result in more frequent 'UnexpectedEmptyPageErrors'.

{'num_retries': 10}
search_options Dict

Optional arguments for arxiv.Search. E.g.: id_list (List), sort_by, or sort_order.

dict()

Returns:

Type Description
DataFrame

pd.DataFrame: One row per paper.

Source code in paperscraper/arxiv/arxiv.py
def get_arxiv_papers_api(
    query: str,
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    max_results: int = 99999,
    client_options: Dict = {"num_retries": 10},
    search_options: Dict = dict(),
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Run a query against the arxiv API and collect the requested fields of
    every returned paper.

    Args:
        query: Query to arxiv API. Needs to match the arxiv API notation.
        fields: List of strings with fields to keep in output.
        max_results: Maximal number of results, defaults to 99999.
        client_options: Optional arguments for `arxiv.Client`. E.g.:
            page_size (int), delay_seconds (int), num_retries (int).
            NOTE: Decreasing 'num_retries' will speed up processing but might
            result in more frequent 'UnexpectedEmptyPageErrors'.
        search_options: Optional arguments for `arxiv.Search`. E.g.:
            id_list (List), sort_by, or sort_order.
        verbose: Whether to show a progress bar while results are processed.

    Returns:
        pd.DataFrame: One row per paper.

    """
    client = arxiv.Client(**client_options)
    search = arxiv.Search(query=query, max_results=max_results, **search_options)

    def keep_as_is(value):
        # Fallback for fields without a dedicated post-processing function.
        return value

    rows = []
    progress = tqdm(
        client.results(search), desc=f"Processing {query}", disable=not verbose
    )
    for paper in progress:
        row = {}
        for key, value in vars(paper).items():
            # Rename the result attribute to the output field name, if mapped.
            name = arxiv_field_mapper.get(key, key)
            # The raw `doi` attribute is always skipped.
            if name not in fields or key == "doi":
                continue
            row[name] = process_fields.get(name, keep_as_is)(value)
        rows.append(row)

    return pd.DataFrame(rows)

get_and_dump_arxiv_papers(keywords: List[Union[str, List[str]]], output_filepath: str, fields: List = ['title', 'authors', 'date', 'abstract', 'journal', 'doi'], start_date: str = 'None', end_date: str = 'None', backend: Literal['api', 'local', 'infer'] = 'api', *args, **kwargs)

Combines get_arxiv_papers and dump_papers.

Parameters:

Name Type Description Default
keywords List[Union[str, List[str]]]

List of keywords for arxiv search. The outer list level will be considered as AND separated keys, the inner level as OR separated.

required
output_filepath str

Path where the dump will be saved.

required
fields List

List of strings with fields to keep in output. Defaults to ['title', 'authors', 'date', 'abstract', 'journal', 'doi'].

['title', 'authors', 'date', 'abstract', 'journal', 'doi']
start_date str

Start date for the search. Needs to be in format: YYYY-MM-DD, e.g. '2020-07-20'. Defaults to 'None', i.e. no specific dates are used.

'None'
end_date str

End date for the search. Same notation as start_date.

'None'
backend Literal['api', 'local', 'infer']

If api, the arXiv API is queried. If local the local arXiv dump is queried (has to be downloaded before). If infer the local dump will be used if exists, otherwise API will be queried. Defaults to api since it is faster.

'api'
Source code in paperscraper/arxiv/arxiv.py
def get_and_dump_arxiv_papers(
    keywords: List[Union[str, List[str]]],
    output_filepath: str,
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    start_date: str = "None",
    end_date: str = "None",
    backend: Literal["api", "local", "infer"] = "api",
    *args,
    **kwargs,
):
    """
    Fetch arxiv papers for the given keywords and write them to a file.

    Args:
        keywords: List of keywords for arxiv search.
            The outer list level will be considered as AND separated keys, the
            inner level as OR separated.
        output_filepath: Path where the dump will be saved.
        fields: List of strings with fields to keep in output.
            Defaults to ['title', 'authors', 'date', 'abstract',
            'journal', 'doi'].
        start_date: Start date for the search. Needs to be in format:
            YYYY-MM-DD, e.g. '2020-07-20' (the notation expected by
            `get_query_from_keywords`). Defaults to 'None', i.e. no specific
            dates are used.
        end_date: End date for the search. Same notation as start_date.
        backend: If `api`, the arXiv API is queried. If `local` the local arXiv dump
            is queried (has to be downloaded before). If `infer` the local dump will
            be used if exists, otherwise API will be queried. Defaults to `api`
            since it is faster.
        *args, **kwargs are additional arguments forwarded to
            `get_arxiv_papers_api` or `get_arxiv_papers_local`, depending on
            the backend.

    Raises:
        ValueError: If `backend` is not one of 'api', 'local' or 'infer'.
    """
    # The API query is assembled up front; only the `api` backend consumes it.
    api_query = get_query_from_keywords(
        keywords, start_date=start_date, end_date=end_date
    )

    if backend not in {"api", "local", "infer"}:
        raise ValueError(
            f"Invalid backend: {backend}. Must be one of ['api', 'local', 'infer']"
        )
    if backend == "infer":
        backend = infer_backend()

    if backend == "api":
        papers = get_arxiv_papers_api(api_query, fields, *args, **kwargs)
    elif backend == "local":
        # The local search works on the raw keywords; the date-restricted
        # query string is not passed along.
        papers = get_arxiv_papers_local(keywords, fields, *args, **kwargs)
    dump_papers(papers, output_filepath)

arxiv

get_arxiv_papers_local(keywords: List[Union[str, List[str]]], fields: List[str] = None, output_filepath: str = None) -> pd.DataFrame

Search for papers in the dump using keywords.

Parameters:

Name Type Description Default
keywords List[Union[str, List[str]]]

Items will be AND separated. If items are lists themselves, they will be OR separated.

required
fields List[str]

fields to be used in the query search. Defaults to None, a.k.a. search in all fields excluding date.

None
output_filepath str

optional output filepath where to store the hits in JSONL format. Defaults to None, a.k.a., no export to a file.

None

Returns:

Type Description
DataFrame

pd.DataFrame: A dataframe with one paper per row.

Source code in paperscraper/arxiv/arxiv.py
def get_arxiv_papers_local(
    keywords: List[Union[str, List[str]]],
    fields: List[str] = None,
    output_filepath: str = None,
) -> pd.DataFrame:
    """
    Search for papers in the dump using keywords.

    Args:
        keywords: Items will be AND separated. If items
            are lists themselves, they will be OR separated.
        fields: fields to be used in the query search.
            Defaults to None, a.k.a. search in all fields excluding date.
        output_filepath: optional output filepath where to store the hits in JSONL format.
            Defaults to None, a.k.a., no export to a file.

    Returns:
        pd.DataFrame: A dataframe with one paper per row.

    Raises:
        ValueError: If no local arxiv querier is available after looking for
            the local dump.
    """
    # presumably (re)binds the module-level ARXIV_QUERIER when a local dump
    # exists - defined elsewhere, TODO confirm
    search_local_arxiv()
    if ARXIV_QUERIER is None:
        raise ValueError(
            "Could not find local arxiv dump. Use `backend=api` or download dump via `paperscraper.get_dumps.arxiv`"
        )
    return ARXIV_QUERIER(
        keywords=keywords, fields=fields, output_filepath=output_filepath
    )

get_arxiv_papers_api(query: str, fields: List = ['title', 'authors', 'date', 'abstract', 'journal', 'doi'], max_results: int = 99999, client_options: Dict = {'num_retries': 10}, search_options: Dict = dict(), verbose: bool = True) -> pd.DataFrame

Performs arxiv API request of a given query and returns list of papers with fields as desired.

Parameters:

Name Type Description Default
query str

Query to arxiv API. Needs to match the arxiv API notation.

required
fields List

List of strings with fields to keep in output.

['title', 'authors', 'date', 'abstract', 'journal', 'doi']
max_results int

Maximal number of results, defaults to 99999.

99999
client_options Dict

Optional arguments for arxiv.Client. E.g.: page_size (int), delay_seconds (int), num_retries (int). NOTE: Decreasing 'num_retries' will speed up processing but might result in more frequent 'UnexpectedEmptyPageErrors'.

{'num_retries': 10}
search_options Dict

Optional arguments for arxiv.Search. E.g.: id_list (List), sort_by, or sort_order.

dict()

Returns:

Type Description
DataFrame

pd.DataFrame: One row per paper.

Source code in paperscraper/arxiv/arxiv.py
def get_arxiv_papers_api(
    query: str,
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    max_results: int = 99999,
    client_options: Dict = {"num_retries": 10},
    search_options: Dict = dict(),
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Run a query against the arxiv API and collect the requested fields of
    every returned paper.

    Args:
        query: Query to arxiv API. Needs to match the arxiv API notation.
        fields: List of strings with fields to keep in output.
        max_results: Maximal number of results, defaults to 99999.
        client_options: Optional arguments for `arxiv.Client`. E.g.:
            page_size (int), delay_seconds (int), num_retries (int).
            NOTE: Decreasing 'num_retries' will speed up processing but might
            result in more frequent 'UnexpectedEmptyPageErrors'.
        search_options: Optional arguments for `arxiv.Search`. E.g.:
            id_list (List), sort_by, or sort_order.
        verbose: Whether to show a progress bar while results are processed.

    Returns:
        pd.DataFrame: One row per paper.

    """
    client = arxiv.Client(**client_options)
    search = arxiv.Search(query=query, max_results=max_results, **search_options)

    def keep_as_is(value):
        # Fallback for fields without a dedicated post-processing function.
        return value

    rows = []
    progress = tqdm(
        client.results(search), desc=f"Processing {query}", disable=not verbose
    )
    for paper in progress:
        row = {}
        for key, value in vars(paper).items():
            # Rename the result attribute to the output field name, if mapped.
            name = arxiv_field_mapper.get(key, key)
            # The raw `doi` attribute is always skipped.
            if name not in fields or key == "doi":
                continue
            row[name] = process_fields.get(name, keep_as_is)(value)
        rows.append(row)

    return pd.DataFrame(rows)

get_and_dump_arxiv_papers(keywords: List[Union[str, List[str]]], output_filepath: str, fields: List = ['title', 'authors', 'date', 'abstract', 'journal', 'doi'], start_date: str = 'None', end_date: str = 'None', backend: Literal['api', 'local', 'infer'] = 'api', *args, **kwargs)

Combines get_arxiv_papers and dump_papers.

Parameters:

Name Type Description Default
keywords List[Union[str, List[str]]]

List of keywords for arxiv search. The outer list level will be considered as AND separated keys, the inner level as OR separated.

required
output_filepath str

Path where the dump will be saved.

required
fields List

List of strings with fields to keep in output. Defaults to ['title', 'authors', 'date', 'abstract', 'journal', 'doi'].

['title', 'authors', 'date', 'abstract', 'journal', 'doi']
start_date str

Start date for the search. Needs to be in format: YYYY-MM-DD, e.g. '2020-07-20'. Defaults to 'None', i.e. no specific dates are used.

'None'
end_date str

End date for the search. Same notation as start_date.

'None'
backend Literal['api', 'local', 'infer']

If api, the arXiv API is queried. If local the local arXiv dump is queried (has to be downloaded before). If infer the local dump will be used if exists, otherwise API will be queried. Defaults to api since it is faster.

'api'
Source code in paperscraper/arxiv/arxiv.py
def get_and_dump_arxiv_papers(
    keywords: List[Union[str, List[str]]],
    output_filepath: str,
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    start_date: str = "None",
    end_date: str = "None",
    backend: Literal["api", "local", "infer"] = "api",
    *args,
    **kwargs,
):
    """
    Fetch arxiv papers for the given keywords and write them to a file.

    Args:
        keywords: List of keywords for arxiv search.
            The outer list level will be considered as AND separated keys, the
            inner level as OR separated.
        output_filepath: Path where the dump will be saved.
        fields: List of strings with fields to keep in output.
            Defaults to ['title', 'authors', 'date', 'abstract',
            'journal', 'doi'].
        start_date: Start date for the search. Needs to be in format:
            YYYY-MM-DD, e.g. '2020-07-20' (the notation expected by
            `get_query_from_keywords`). Defaults to 'None', i.e. no specific
            dates are used.
        end_date: End date for the search. Same notation as start_date.
        backend: If `api`, the arXiv API is queried. If `local` the local arXiv dump
            is queried (has to be downloaded before). If `infer` the local dump will
            be used if exists, otherwise API will be queried. Defaults to `api`
            since it is faster.
        *args, **kwargs are additional arguments forwarded to
            `get_arxiv_papers_api` or `get_arxiv_papers_local`, depending on
            the backend.

    Raises:
        ValueError: If `backend` is not one of 'api', 'local' or 'infer'.
    """
    # The API query is assembled up front; only the `api` backend consumes it.
    api_query = get_query_from_keywords(
        keywords, start_date=start_date, end_date=end_date
    )

    if backend not in {"api", "local", "infer"}:
        raise ValueError(
            f"Invalid backend: {backend}. Must be one of ['api', 'local', 'infer']"
        )
    if backend == "infer":
        backend = infer_backend()

    if backend == "api":
        papers = get_arxiv_papers_api(api_query, fields, *args, **kwargs)
    elif backend == "local":
        # The local search works on the raw keywords; the date-restricted
        # query string is not passed along.
        papers = get_arxiv_papers_local(keywords, fields, *args, **kwargs)
    dump_papers(papers, output_filepath)

kaggle

Kaggle-backed arXiv metadata dumping utilities.

arxiv_kaggle(start_date: datetime, end_date: datetime, save_path: str, kaggle_filepath: Optional[str] = None) -> int

Convert a Kaggle arXiv metadata snapshot to paperscraper JSONL format.

Parameters:

Name Type Description Default
start_date datetime

Earliest paper submission date to include.

required
end_date datetime

Latest paper submission date to include.

required
save_path str

Destination JSONL path for converted papers.

required
kaggle_filepath Optional[str]

Existing Kaggle snapshot file. If provided, no Kaggle download is attempted.

None

Returns:

Type Description
int

Number of papers written to save_path.

Source code in paperscraper/arxiv/kaggle.py
def arxiv_kaggle(
    start_date: datetime,
    end_date: datetime,
    save_path: str,
    kaggle_filepath: Optional[str] = None,
) -> int:
    """Convert a Kaggle arXiv metadata snapshot to paperscraper JSONL format.

    The snapshot is read line by line. Records whose date falls inside
    ``[start_date, end_date]`` (both bounds inclusive) are normalized and
    written to `save_path`, separated by newlines with no trailing newline.

    When no `kaggle_filepath` is given, the snapshot is obtained through
    `download_kaggle_snapshot()` and the default Kaggle directory is removed
    once the conversion ends, whether it succeeded or not.

    Args:
        start_date: Earliest paper submission date to include.
        end_date: Latest paper submission date to include.
        save_path: Destination JSONL path for converted papers.
        kaggle_filepath: Existing Kaggle snapshot file. If provided, no Kaggle
            download is attempted.

    Returns:
        Number of papers written to `save_path`.
    """
    # Only clean up the download directory when this call was responsible for
    # fetching the snapshot; a user-supplied file is never removed.
    cleanup_dir = default_kaggle_dir() if kaggle_filepath is None else None
    if kaggle_filepath is None:
        kaggle_filepath = download_kaggle_snapshot()

    try:
        written = 0
        os.makedirs(os.path.dirname(os.path.abspath(save_path)), exist_ok=True)
        # The progress bar tracks bytes consumed from the snapshot file.
        total_size = os.path.getsize(kaggle_filepath)
        with (
            open(kaggle_filepath, "r", encoding="utf-8") as in_fp,
            open(save_path, "w", encoding="utf-8") as out_fp,
            tqdm(
                total=total_size,
                desc="Converting arXiv Kaggle snapshot",
                unit="B",
                unit_scale=True,
            ) as progress_bar,
        ):
            for line in in_fp:
                progress_bar.update(len(line.encode("utf-8")))
                if not line.strip():
                    continue

                record = json.loads(line)
                paper_date = get_kaggle_paper_date(record)
                if paper_date is None or not start_date <= paper_date <= end_date:
                    continue

                if written > 0:
                    # Write "\n", not os.linesep: the file is open in text
                    # mode, which already translates "\n" to the platform
                    # line ending. os.linesep would end up as "\r\r\n" on
                    # Windows.
                    out_fp.write("\n")
                out_fp.write(json.dumps(normalize_kaggle_record(record, paper_date)))
                written += 1
        return written
    finally:
        if cleanup_dir is not None:
            shutil.rmtree(cleanup_dir, ignore_errors=True)

download_kaggle_snapshot() -> str

Download the Kaggle arXiv metadata snapshot if needed.

Returns:

Type Description
str

Path to the local Kaggle snapshot JSON file.

Raises:

Type Description
ImportError

If the kaggle package is not installed.

RuntimeError

If Kaggle authentication is missing or invalid.

FileNotFoundError

If the download succeeds but no snapshot JSON is found.

Source code in paperscraper/arxiv/kaggle.py
def download_kaggle_snapshot() -> str:
    """Fetch the Kaggle arXiv metadata snapshot unless one is already on disk.

    The default Kaggle directory is created if necessary. A snapshot already
    present there is returned right away, without importing the Kaggle client.

    Returns:
        Path to the local Kaggle snapshot JSON file.

    Raises:
        ImportError: If the `kaggle` package is not installed.
        RuntimeError: If Kaggle authentication is missing or invalid.
        FileNotFoundError: If the download succeeds but no snapshot JSON is found.
    """
    auth_message = (
        "Kaggle authentication is required for the arXiv Kaggle backend. "
        "Run `kaggle auth login` or configure Kaggle credentials."
    )

    target_dir = default_kaggle_dir()
    os.makedirs(target_dir, exist_ok=True)

    cached_snapshot = find_kaggle_snapshot(target_dir)
    if cached_snapshot:
        return cached_snapshot

    # Imported lazily so the dependency is only needed for an actual download.
    try:
        from kaggle.api.kaggle_api_extended import KaggleApi
    except ImportError as exc:
        raise ImportError(
            "The Kaggle backend requires the `kaggle` package. Install it with "
            "`pip install kaggle` or `uv add kaggle`."
        ) from exc
    except SystemExit as exc:
        # NOTE(review): handled as an authentication failure - presumably the
        # kaggle package can exit during import when credentials are missing;
        # confirm against the kaggle client.
        raise RuntimeError(auth_message) from exc

    api = KaggleApi()
    try:
        api.authenticate()
    except Exception as exc:
        raise RuntimeError(auth_message) from exc

    api.dataset_download_files(
        DEFAULT_KAGGLE_DATASET,
        path=target_dir,
        unzip=True,
        quiet=False,
    )

    downloaded_snapshot = find_kaggle_snapshot(target_dir)
    if downloaded_snapshot is not None:
        return downloaded_snapshot
    raise FileNotFoundError(f"No arXiv Kaggle snapshot found in {target_dir}")

default_kaggle_dir() -> str

Return the default temporary directory for Kaggle arXiv downloads.

Returns:

Type Description
str

Path to the default Kaggle download directory.

Source code in paperscraper/arxiv/kaggle.py
def default_kaggle_dir() -> str:
    """Build the default download directory for the Kaggle arXiv snapshot.

    Returns:
        The `arxiv_kaggle` folder inside the directory reported by
        `get_server_dumps_dir()`.
    """
    dumps_root = get_server_dumps_dir()
    return os.path.join(dumps_root, "arxiv_kaggle")

find_kaggle_snapshot(kaggle_dir: str) -> Optional[str]

Find the arXiv metadata snapshot JSON in a Kaggle download directory.

Parameters:

Name Type Description Default
kaggle_dir str

Directory to search.

required

Returns:

Type Description
Optional[str]

Path to the largest candidate JSON file, or None if no candidate exists.

Source code in paperscraper/arxiv/kaggle.py
def find_kaggle_snapshot(kaggle_dir: str) -> Optional[str]:
    """Locate the arXiv metadata snapshot inside a Kaggle download directory.

    Files matching `arxiv-metadata*.json` are listed first, followed by every
    `*.json` entry; directories are ignored. Among the remaining files the
    one with the greatest size on disk is selected.

    Args:
        kaggle_dir: Directory to search.

    Returns:
        Path to the largest candidate JSON file, or None if no candidate exists.
    """
    patterns = ("arxiv-metadata*.json", "*.json")
    json_files = [
        match
        for pattern in patterns
        for match in glob.glob(os.path.join(kaggle_dir, pattern))
        if os.path.isfile(match)
    ]
    if not json_files:
        return None
    # `max` keeps the earliest entry on ties, like a stable descending sort.
    return max(json_files, key=os.path.getsize)

get_kaggle_paper_date(record: dict) -> Optional[datetime]

Extract the first submission date from a Kaggle arXiv record.

Parameters:

Name Type Description Default
record dict

Raw Kaggle arXiv metadata record.

required

Returns:

Type Description
Optional[datetime]

Naive UTC-normalized submission date at midnight, or None if no usable date is available.

Source code in paperscraper/arxiv/kaggle.py
def get_kaggle_paper_date(record: dict) -> Optional[datetime]:
    """Extract the first submission date from a Kaggle arXiv record.

    The `created` timestamp of the first non-empty entry in `versions` is
    preferred. When it is absent or cannot be parsed, the record's
    `update_date` (`YYYY-MM-DD`) is used instead. Malformed fields never
    raise; they simply make that source of the date unusable.

    Args:
        record: Raw Kaggle arXiv metadata record.

    Returns:
        Naive UTC-normalized submission date at midnight, or None if no usable
        date is available.
    """
    # `versions` may be missing or explicitly null; both mean "no versions".
    versions = record.get("versions") or []
    created = next(
        (
            version.get("created")
            for version in versions
            # Skip empty or non-mapping entries instead of failing on `.get`.
            if version and isinstance(version, dict)
        ),
        None,
    )
    if created:
        try:
            date = parsedate_to_datetime(created)
            if date.tzinfo is not None:
                # Convert to UTC first so the calendar day is the UTC day.
                date = date.astimezone(timezone.utc).replace(tzinfo=None)
            return date.replace(hour=0, minute=0, second=0, microsecond=0)
        except (TypeError, ValueError):
            # Unparseable timestamp: fall through to `update_date`.
            pass

    update_date = record.get("update_date")
    if update_date:
        try:
            return datetime.strptime(update_date, "%Y-%m-%d")
        except (TypeError, ValueError):
            # TypeError covers non-string values, ValueError a wrong format.
            return None
    return None

normalize_kaggle_record(record: dict, paper_date: datetime) -> dict

Normalize a Kaggle arXiv record to paperscraper dump fields.

Parameters:

Name Type Description Default
record dict

Raw Kaggle arXiv metadata record.

required
paper_date datetime

Submission date returned by get_kaggle_paper_date.

required

Returns:

Type Description
dict

Dictionary with paperscraper's standard title, authors, date, abstract, journal, and doi fields.

Source code in paperscraper/arxiv/kaggle.py
def normalize_kaggle_record(record: dict, paper_date: datetime) -> dict:
    """Normalize a Kaggle arXiv record to paperscraper dump fields.

    When the record carries no DOI, an arXiv DOI of the form
    `10.48550/arXiv.<id>` is built from the record's `id` with any trailing
    version suffix (`v<digits>`) removed.

    Args:
        record: Raw Kaggle arXiv metadata record.
        paper_date: Submission date returned by `get_kaggle_paper_date`.

    Returns:
        Dictionary with paperscraper's standard `title`, `authors`, `date`,
        `abstract`, `journal`, and `doi` fields.
    """
    raw_id = str(record.get("id", ""))
    # Strip only a trailing version suffix such as "v2". Splitting on the
    # first "v" would truncate old-style identifiers whose archive name
    # contains that letter (e.g. "solv-int/9901001" -> "sol").
    base, sep, version = raw_id.rpartition("v")
    arxiv_id = base if sep and version.isdigit() else raw_id
    return {
        "title": normalize_whitespace(record.get("title", "")),
        "authors": normalize_whitespace(record.get("authors", "")),
        "date": paper_date.strftime("%Y-%m-%d"),
        "abstract": normalize_whitespace(record.get("abstract", "")),
        "journal": normalize_whitespace(record.get("journal-ref", "")),
        "doi": record.get("doi") or f"10.48550/arXiv.{arxiv_id}",
    }

utils

format_date(date_str: str) -> str

Converts a date in YYYY-MM-DD format to arXiv's YYYYMMDDTTTT format.

Source code in paperscraper/arxiv/utils.py
def format_date(date_str: str) -> str:
    """Translate a `YYYY-MM-DD` date into arXiv's `YYYYMMDDTTTT` notation.

    The time component is always emitted as `0000`.
    """
    return datetime.strptime(date_str, "%Y-%m-%d").strftime("%Y%m%d0000")

get_query_from_keywords(keywords: List[Union[str, List[str]]], start_date: str = 'None', end_date: str = 'None') -> str

Receives a list of keywords and returns the query for the arxiv API.

Parameters:

Name Type Description Default
keywords List[Union[str, List[str]]]

Items will be AND separated. If items are lists themselves, they will be OR separated.

required
start_date str

Start date for the search. Needs to be in format: YYYY-MM-DD, e.g. '2020-07-20'. Defaults to 'None', i.e. no specific dates are used.

'None'
end_date str

End date for the search. Same notation as start_date.

'None'

Returns:

Name Type Description
str str

query to enter to arxiv API.

Source code in paperscraper/arxiv/utils.py
def get_query_from_keywords(
    keywords: List[Union[str, List[str]]],
    start_date: str = "None",
    end_date: str = "None",
) -> str:
    """Build an arXiv API query string from keywords and an optional date range.

    Args:
        keywords (List[Union[str, List[str]]]): Top-level items are combined
            with AND. An item that is itself a list contributes its members
            combined with OR. Items that are neither `str` nor `list` are
            skipped.
        start_date (str): Start date for the search in the format YYYY-MM-DD,
            e.g. '2020-07-20'. Defaults to 'None'. If only the start date is
            'None', `EARLIEST_START` is used.
        end_date (str): End date for the search. Same notation as start_date.
            If only the end date is 'None', today's date is used.

    Returns:
        str: query to enter to arxiv API. No date filter is appended when both
        dates are 'None'.
    """
    fragments = []
    for term in keywords:
        if isinstance(term, str):
            fragments.append(f"all:{term} AND ")
        elif isinstance(term, list):
            alternatives = "".join(f"all:{syn} OR " for syn in term)
            fragments.append(finalize_disjunction(alternatives))

    query = finalize_conjunction("".join(fragments))

    open_start = start_date == "None"
    open_end = end_date == "None"
    if open_start and open_end:
        return query

    # Exactly one bound may still be open here; fill it with its default.
    if open_start:
        start_date = EARLIEST_START
    elif open_end:
        end_date = datetime.now().strftime("%Y-%m-%d")

    start = format_date(start_date)
    end = format_date(end_date)
    return query + f" AND submittedDate:[{start} TO {end}]"