paperscraper.arxiv

`paperscraper.arxiv` ¶

`XRXivQuery` ¶

Query class.

Source code in paperscraper/xrxiv/xrxiv_query.py

class XRXivQuery:
    """Keyword search over a JSONL dump of paper metadata.

    The dump is loaded into a ``pandas.DataFrame`` stored on ``self.df``.
    When loading fails with a ``ValueError`` or ``KeyError`` the problem is
    logged and ``self.errored`` is set to True; in that case ``self.df`` may
    be missing (read failure) or lack the normalised ``date`` column.
    """

    def __init__(
        self,
        dump_filepath: str,
        fields: List[str] = ["title", "doi", "authors", "abstract", "date", "journal"],
    ):
        """
        Initialize the query class.

        Args:
            dump_filepath (str): filepath to the dump to be queried.
            fields (List[str], optional): fields contained in the dump per paper.
                Defaults to ['title', 'doi', 'authors', 'abstract', 'date', 'journal'].
        """
        self.dump_filepath = dump_filepath
        # Copy the list: the default is a single shared object, so storing it
        # by reference would let a mutation on one instance leak into others.
        self.fields = list(fields)
        self.errored = False

        try:
            self.df = pd.read_json(self.dump_filepath, lines=True)
            # Normalise dates to plain 'YYYY-MM-DD' strings.
            self.df["date"] = [date.strftime("%Y-%m-%d") for date in self.df["date"]]
        except ValueError as e:
            logger.warning(f"Problem in reading file {dump_filepath}: {e} - Skipping!")
            self.errored = True
        except KeyError as e:
            logger.warning(f"Key {e} missing in file from {dump_filepath} - Skipping!")
            self.errored = True

    def search_keywords(
        self,
        keywords: List[Union[str, List[str]]],
        fields: List[str] = None,
        output_filepath: str = None,
    ) -> pd.DataFrame:
        """
        Search for papers in the dump using keywords.

        Matching is case-insensitive. Each keyword is passed to
        ``Series.str.contains`` and is therefore interpreted as a regular
        expression. A paper is a hit when at least one searched field
        satisfies every keyword. Missing field values never match.

        Args:
            keywords (List[Union[str, List[str]]]): Items will be AND separated.
                If items are lists themselves, they will be OR separated.
            fields (List[str], optional): fields to be used in the query search.
                Defaults to None, a.k.a. search in all fields excluding date.
            output_filepath (str, optional): optional output filepath where to store
                the hits in JSONL format. Defaults to None, a.k.a., no export to a file.

        Returns:
            pd.DataFrame: A dataframe with one paper per row. Empty when no
                keywords are given or no searchable field remains.
        """
        if fields is None:
            fields = self.fields
        # The date column is never searched.
        search_fields = [field for field in fields if field != "date"]

        # One lower-cased pattern per AND-term; OR-groups become an alternation.
        patterns = []
        for keyword in keywords:
            if isinstance(keyword, list):
                patterns.append("|".join([synonym.lower() for synonym in keyword]))
            else:
                patterns.append(keyword.lower())

        # Start from "nothing matches" so the mask is always defined, even
        # when there are no keywords or no fields to search.
        hits = pd.Series(False, index=self.df.index, dtype=bool)
        if patterns:
            for field in search_fields:
                field_data = self.df[field].str.lower()
                field_hits = pd.Series(True, index=self.df.index, dtype=bool)
                for pattern in patterns:
                    # na=False: a missing value is a non-match, not NaN
                    # (NaN in the mask would make the row selection fail).
                    field_hits = field_hits & field_data.str.contains(pattern, na=False)
                hits = hits | field_hits

        matches = self.df[hits]
        if output_filepath is not None:
            matches.to_json(output_filepath, orient="records", lines=True)
        return matches

`__init__(dump_filepath: str, fields: List[str] = ['title', 'doi', 'authors', 'abstract', 'date', 'journal'])` ¶

Initialize the query class.

Parameters:

Name	Type	Description	Default
`dump_filepath`	`str`	filepath to the dump to be queried.	required
`fields`	`List[str]`	Fields contained in the dump per paper. Defaults to ['title', 'doi', 'authors', 'abstract', 'date', 'journal'].	`['title', 'doi', 'authors', 'abstract', 'date', 'journal']`

Source code in paperscraper/xrxiv/xrxiv_query.py

def __init__(
    self,
    dump_filepath: str,
    fields: List[str] = ["title", "doi", "authors", "abstract", "date", "journal"],
):
    """
    Initialize the query class.

    Loads the JSONL dump into ``self.df`` and rewrites its ``date`` column as
    'YYYY-MM-DD' strings. A ``ValueError`` or ``KeyError`` raised while doing
    so is logged as a warning and recorded by setting ``self.errored`` to True
    instead of being propagated.

    Args:
        dump_filepath (str): filepath to the dump to be queried.
        fields (List[str], optional): fields contained in the dump per paper.
            Defaults to ['title', 'doi', 'authors', 'abstract', 'date', 'journal'].
    """
    self.dump_filepath = dump_filepath
    # Copy the list: the default is a single shared object, so storing it by
    # reference would let a mutation on one instance leak into later ones.
    self.fields = list(fields)
    self.errored = False

    try:
        self.df = pd.read_json(self.dump_filepath, lines=True)
        # Normalise dates to plain 'YYYY-MM-DD' strings.
        self.df["date"] = [date.strftime("%Y-%m-%d") for date in self.df["date"]]
    except ValueError as e:
        logger.warning(f"Problem in reading file {dump_filepath}: {e} - Skipping!")
        self.errored = True
    except KeyError as e:
        logger.warning(f"Key {e} missing in file from {dump_filepath} - Skipping!")
        self.errored = True

`search_keywords(keywords: List[Union[str, List[str]]], fields: List[str] = None, output_filepath: str = None) -> pd.DataFrame` ¶

Search for papers in the dump using keywords.

Parameters:

Name	Type	Description	Default
`keywords`	`List[str, List[str]]`	Items will be AND separated. If items are lists themselves, they will be OR separated.	required
`fields`	`List[str]`	fields to be used in the query search. Defaults to None, a.k.a. search in all fields excluding date.	`None`
`output_filepath`	`str`	optional output filepath where to store the hits in JSONL format. Defaults to None, a.k.a., no export to a file.	`None`

Returns:

Type	Description
`DataFrame`	pd.DataFrame: A dataframe with one paper per row.

Source code in paperscraper/xrxiv/xrxiv_query.py

def search_keywords(
    self,
    keywords: List[Union[str, List[str]]],
    fields: List[str] = None,
    output_filepath: str = None,
) -> pd.DataFrame:
    """
    Search for papers in the dump using keywords.

    Reads ``self.df`` (the loaded dump) and ``self.fields`` (the default
    fields). Matching is case-insensitive. Each keyword is passed to
    ``Series.str.contains`` and is therefore interpreted as a regular
    expression. A paper is a hit when at least one searched field satisfies
    every keyword. Missing field values never match.

    Args:
        keywords (List[Union[str, List[str]]]): Items will be AND separated.
            If items are lists themselves, they will be OR separated.
        fields (List[str], optional): fields to be used in the query search.
            Defaults to None, a.k.a. search in all fields excluding date.
        output_filepath (str, optional): optional output filepath where to store
            the hits in JSONL format. Defaults to None, a.k.a., no export to a file.

    Returns:
        pd.DataFrame: A dataframe with one paper per row. Empty when no
            keywords are given or no searchable field remains.
    """
    if fields is None:
        fields = self.fields
    # The date column is never searched.
    search_fields = [field for field in fields if field != "date"]

    # One lower-cased pattern per AND-term; OR-groups become an alternation.
    patterns = []
    for keyword in keywords:
        if isinstance(keyword, list):
            patterns.append("|".join([synonym.lower() for synonym in keyword]))
        else:
            patterns.append(keyword.lower())

    # Start from "nothing matches" so the mask is always defined, even when
    # there are no keywords or no fields to search.
    hits = pd.Series(False, index=self.df.index, dtype=bool)
    if patterns:
        for field in search_fields:
            field_data = self.df[field].str.lower()
            field_hits = pd.Series(True, index=self.df.index, dtype=bool)
            for pattern in patterns:
                # na=False: a missing value is a non-match, not NaN
                # (NaN in the mask would make the row selection fail).
                field_hits = field_hits & field_data.str.contains(pattern, na=False)
            hits = hits | field_hits

    matches = self.df[hits]
    if output_filepath is not None:
        matches.to_json(output_filepath, orient="records", lines=True)
    return matches

`dump_papers(papers: pd.DataFrame, filepath: str) -> None` ¶

Receives a pd.DataFrame, one paper per row and dumps it into a .jsonl file with one paper per line.

Parameters:

Name	Type	Description	Default
`papers`	`DataFrame`	A dataframe of paper metadata, one paper per row.	required
`filepath`	`str`	Path to dump the papers, has to end with `.jsonl`.	required

Source code in paperscraper/utils.py

def dump_papers(papers: pd.DataFrame, filepath: str) -> None:
    """
    Receives a pd.DataFrame, one paper per row and dumps it into a .jsonl
    file with one paper per line.

    Args:
        papers (pd.DataFrame): A dataframe of paper metadata, one paper per row.
            A list of dictionaries is still accepted (legacy) and is converted
            to a dataframe after logging a warning.
        filepath (str): Path to dump the papers, has to end with `.jsonl`.

    Raises:
        TypeError: If `filepath` is not a string, or `papers` is neither a
            dataframe nor a list of dictionaries.
        ValueError: If `filepath` does not end with `.jsonl`.
    """
    if not isinstance(filepath, str):
        raise TypeError(f"filepath must be a string, not {type(filepath)}")
    if not filepath.endswith(".jsonl"):
        raise ValueError("Please provide a filepath with .jsonl extension")

    if isinstance(papers, List) and all([isinstance(p, Dict) for p in papers]):
        papers = pd.DataFrame(papers)
        logger.warning(
            "Preferably pass a pd.DataFrame, not a list of dictionaries. "
            "Passing a list is a legacy functionality that might become deprecated."
        )

    if not isinstance(papers, pd.DataFrame):
        raise TypeError(f"papers must be a pd.DataFrame, not {type(papers)}")

    # Row-wise records keep every row. Going through `papers.T.to_dict()`
    # keys rows by index label, so rows sharing a label would be collapsed.
    paper_list = papers.to_dict(orient="records")

    with open(filepath, "w") as f:
        f.writelines(json.dumps(paper) + "\n" for paper in paper_list)

`get_query_from_keywords(keywords: List[Union[str, List[str]]], start_date: str = 'None', end_date: str = 'None') -> str` ¶

Receives a list of keywords and returns the query for the arxiv API.

Parameters:

Name	Type	Description	Default
`keywords`	`List[str, List[str]]`	Items will be AND separated. If items are lists themselves, they will be OR separated.	required
`start_date`	`str`	Start date for the search. Needs to be in format: YYYY-MM-DD, e.g. '2020-07-20'. Defaults to 'None', i.e. no specific dates are used.	`'None'`
`end_date`	`str`	End date for the search. Same notation as start_date.	`'None'`

Returns:

Name	Type	Description
`str`	`str`	query to enter to arxiv API.

Source code in paperscraper/arxiv/utils.py

def get_query_from_keywords(
    keywords: List[Union[str, List[str]]],
    start_date: str = "None",
    end_date: str = "None",
) -> str:
    """Build the arxiv API query string for a list of keywords.

    Every plain string contributes an ``all:<keyword> AND `` clause. Every
    nested list contributes its ``all:<synonym> OR `` clauses after they have
    been passed through ``finalize_disjunction``. The concatenated clauses are
    passed through ``finalize_conjunction``. Items that are neither a string
    nor a list are skipped.

    Args:
        keywords (List[Union[str, List[str]]]): Items will be AND separated.
            If items are lists themselves, they will be OR separated.
        start_date (str): Start date for the search. Needs to be in format:
            YYYY-MM-DD, e.g. '2020-07-20'. Defaults to 'None', i.e. no specific
            dates are used. If only the end date is given, EARLIEST_START is
            used as start.
        end_date (str): End date for the search. Same notation as start_date.
            If only the start date is given, today's date is used as end.

    Returns:
        str: query to enter to arxiv API.
    """
    clauses = []
    for term in keywords:
        if isinstance(term, str):
            clauses.append(f"all:{term} AND ")
        elif isinstance(term, list):
            alternatives = "".join(f"all:{synonym} OR " for synonym in term)
            clauses.append(finalize_disjunction(alternatives))
    query = finalize_conjunction("".join(clauses))

    if start_date == "None":
        if end_date == "None":
            # No date restriction requested at all.
            return query
        start_date = EARLIEST_START
    elif end_date == "None":
        end_date = datetime.now().strftime("%Y-%m-%d")

    start = format_date(start_date)
    end = format_date(end_date)
    date_filter = f" AND submittedDate:[{start} TO {end}]"
    return query + date_filter

`get_arxiv_papers_local(keywords: List[Union[str, List[str]]], fields: List[str] = None, output_filepath: str = None) -> pd.DataFrame` ¶

Search for papers in the dump using keywords.

Parameters:

Name	Type	Description	Default
`keywords`	`List[Union[str, List[str]]]`	Items will be AND separated. If items are lists themselves, they will be OR separated.	required
`fields`	`List[str]`	fields to be used in the query search. Defaults to None, a.k.a. search in all fields excluding date.	`None`
`output_filepath`	`str`	optional output filepath where to store the hits in JSONL format. Defaults to None, a.k.a., no export to a file.	`None`

Returns:

Type	Description
`DataFrame`	pd.DataFrame: A dataframe with one paper per row.

Source code in paperscraper/arxiv/arxiv.py

def get_arxiv_papers_local(
    keywords: List[Union[str, List[str]]],
    fields: List[str] = None,
    output_filepath: str = None,
) -> pd.DataFrame:
    """
    Run a keyword search against the local arxiv dump.

    Calls `search_local_arxiv()` first and then delegates to the module-level
    `ARXIV_QUERIER` (presumably populated by that call - verify in the module).

    Args:
        keywords: Items will be AND separated. If items
            are lists themselves, they will be OR separated.
        fields: fields to be used in the query search.
            Defaults to None, a.k.a. search in all fields excluding date.
        output_filepath: optional output filepath where to store the hits in JSONL format.
            Defaults to None, a.k.a., no export to a file.

    Returns:
        pd.DataFrame: A dataframe with one paper per row.

    Raises:
        ValueError: If `ARXIV_QUERIER` is still None after the lookup.
    """
    search_local_arxiv()
    if ARXIV_QUERIER is not None:
        return ARXIV_QUERIER(
            keywords=keywords,
            fields=fields,
            output_filepath=output_filepath,
        )
    raise ValueError(
        "Could not find local arxiv dump. Use `backend=api` or download dump via `paperscraper.get_dumps.arxiv"
    )

`get_arxiv_papers_api(query: str, fields: List = ['title', 'authors', 'date', 'abstract', 'journal', 'doi'], max_results: int = 99999, client_options: Dict = {'num_retries': 10}, search_options: Dict = dict(), verbose: bool = True) -> pd.DataFrame` ¶

Performs arxiv API request of a given query and returns list of papers with fields as desired.

Parameters:

Name	Type	Description	Default
`query`	`str`	Query to arxiv API. Needs to match the arxiv API notation.	required
`fields`	`List`	List of strings with fields to keep in output.	`['title', 'authors', 'date', 'abstract', 'journal', 'doi']`
`max_results`	`int`	Maximal number of results, defaults to 99999.	`99999`
`client_options`	`Dict`	Optional arguments for `arxiv.Client`. E.g.: page_size (int), delay_seconds (int), num_retries (int). NOTE: Decreasing 'num_retries' will speed up processing but might result in more frequent 'UnexpectedEmptyPageErrors'.	`{'num_retries': 10}`
`search_options`	`Dict`	Optional arguments for `arxiv.Search`. E.g.: id_list (List), sort_by, or sort_order.	`dict()`

Returns:

Type	Description
`DataFrame`	pd.DataFrame: One row per paper.

Source code in paperscraper/arxiv/arxiv.py

def get_arxiv_papers_api(
    query: str,
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    max_results: int = 99999,
    client_options: Dict = {"num_retries": 10},
    search_options: Dict = dict(),
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Query the arxiv API and collect the requested fields of every result.

    Each attribute of a result is renamed through `arxiv_field_mapper`
    (unmapped names are kept), kept only if the renamed field is requested,
    and post-processed by the matching entry of `process_fields` when there
    is one. The raw `doi` attribute is always skipped.

    Args:
        query: Query to arxiv API. Needs to match the arxiv API notation.
        fields: List of strings with fields to keep in output.
        max_results: Maximal number of results, defaults to 99999.
        client_options: Optional arguments for `arxiv.Client`. E.g.:
            page_size (int), delay_seconds (int), num_retries (int).
            NOTE: Decreasing 'num_retries' will speed up processing but might
            result in more frequent 'UnexpectedEmptyPageErrors'.
        search_options: Optional arguments for `arxiv.Search`. E.g.:
            id_list (List), sort_by, or sort_order.
        verbose: Whether to show the progress bar. Defaults to True.

    Returns:
        pd.DataFrame: One row per paper.

    """

    def _passthrough(value):
        # Fallback for fields without a dedicated post-processor.
        return value

    client = arxiv.Client(**client_options)
    search = arxiv.Search(query=query, max_results=max_results, **search_options)

    rows = []
    progress = tqdm(
        client.results(search), desc=f"Processing {query}", disable=not verbose
    )
    for paper in progress:
        row = {}
        for attribute, raw_value in vars(paper).items():
            name = arxiv_field_mapper.get(attribute, attribute)
            if name in fields and attribute != "doi":
                row[name] = process_fields.get(name, _passthrough)(raw_value)
        rows.append(row)

    return pd.DataFrame(rows)

`get_and_dump_arxiv_papers(keywords: List[Union[str, List[str]]], output_filepath: str, fields: List = ['title', 'authors', 'date', 'abstract', 'journal', 'doi'], start_date: str = 'None', end_date: str = 'None', backend: Literal['api', 'local', 'infer'] = 'api', *args, **kwargs)` ¶

Combines get_arxiv_papers and dump_papers.

Parameters:

Name	Type	Description	Default
`keywords`	`List[Union[str, List[str]]]`	List of keywords for arxiv search. The outer list level will be considered as AND separated keys, the inner level as OR separated.	required
`output_filepath`	`str`	Path where the dump will be saved.	required
`fields`	`List`	List of strings with fields to keep in output. Defaults to ['title', 'authors', 'date', 'abstract', 'journal', 'doi'].	`['title', 'authors', 'date', 'abstract', 'journal', 'doi']`
`start_date`	`str`	Start date for the search. Needs to be in format: YYYY-MM-DD, e.g. '2020-07-20'. Defaults to 'None', i.e. no specific dates are used.	`'None'`
`end_date`	`str`	End date for the search. Same notation as start_date.	`'None'`
`backend`	`Literal['api', 'local', 'infer']`	If `api`, the arXiv API is queried. If `local` the local arXiv dump is queried (has to be downloaded before). If `infer` the local dump will be used if exists, otherwise API will be queried. Defaults to `api` since it is faster.	`'api'`

Source code in paperscraper/arxiv/arxiv.py

def get_and_dump_arxiv_papers(
    keywords: List[Union[str, List[str]]],
    output_filepath: str,
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    start_date: str = "None",
    end_date: str = "None",
    backend: Literal["api", "local", "infer"] = "api",
    *args,
    **kwargs,
):
    """
    Fetch arxiv papers for the given keywords and dump them to a file.

    Combines `get_arxiv_papers_api` / `get_arxiv_papers_local` with
    `dump_papers`.

    Args:
        keywords: List of keywords for arxiv search.
            The outer list level will be considered as AND separated keys, the
            inner level as OR separated.
        output_filepath: Path where the dump will be saved.
        fields: List of strings with fields to keep in output.
            Defaults to ['title', 'authors', 'date', 'abstract',
            'journal', 'doi'].
        start_date: Start date for the search. Needs to be in format:
            YYYY-MM-DD, e.g. '2020-07-20'. Defaults to 'None', i.e. no specific
            dates are used.
        end_date: End date for the search. Same notation as start_date.
        backend: If `api`, the arXiv API is queried. If `local` the local arXiv dump
            is queried (has to be downloaded before). If `infer` the local dump will
            be used if exists, otherwise API will be queried. Defaults to `api`
            since it is faster.
        *args, **kwargs are additional arguments forwarded to
            `get_arxiv_papers_api` or `get_arxiv_papers_local`.

    Raises:
        ValueError: If `backend` is not one of 'api', 'local', 'infer', or
            if the inferred backend is neither 'api' nor 'local'.

    Note:
        The date range is part of the API query only; the local backend
        receives the raw keywords.
    """
    # Translate keywords into query.
    query = get_query_from_keywords(keywords, start_date=start_date, end_date=end_date)

    if backend not in {"api", "local", "infer"}:
        raise ValueError(
            f"Invalid backend: {backend}. Must be one of ['api', 'local', 'infer']"
        )
    if backend == "infer":
        backend = infer_backend()

    if backend == "api":
        papers = get_arxiv_papers_api(query, fields, *args, **kwargs)
    elif backend == "local":
        papers = get_arxiv_papers_local(keywords, fields, *args, **kwargs)
    else:
        # Only reachable when the inferred backend is unexpected; fail with a
        # clear message instead of an unbound `papers` below.
        raise ValueError(
            f"Inferred backend {backend!r} is not supported. Must be 'api' or 'local'"
        )
    dump_papers(papers, output_filepath)

`arxiv` ¶

`get_arxiv_papers_local(keywords: List[Union[str, List[str]]], fields: List[str] = None, output_filepath: str = None) -> pd.DataFrame` ¶

Search for papers in the dump using keywords.

Parameters:

Name	Type	Description	Default
`keywords`	`List[Union[str, List[str]]]`	Items will be AND separated. If items are lists themselves, they will be OR separated.	required
`fields`	`List[str]`	fields to be used in the query search. Defaults to None, a.k.a. search in all fields excluding date.	`None`
`output_filepath`	`str`	optional output filepath where to store the hits in JSONL format. Defaults to None, a.k.a., no export to a file.	`None`

Returns:

Type	Description
`DataFrame`	pd.DataFrame: A dataframe with one paper per row.

Source code in paperscraper/arxiv/arxiv.py

def get_arxiv_papers_local(
    keywords: List[Union[str, List[str]]],
    fields: List[str] = None,
    output_filepath: str = None,
) -> pd.DataFrame:
    """
    Run a keyword search against the local arxiv dump.

    Calls `search_local_arxiv()` first and then delegates to the module-level
    `ARXIV_QUERIER` (presumably populated by that call - verify in the module).

    Args:
        keywords: Items will be AND separated. If items
            are lists themselves, they will be OR separated.
        fields: fields to be used in the query search.
            Defaults to None, a.k.a. search in all fields excluding date.
        output_filepath: optional output filepath where to store the hits in JSONL format.
            Defaults to None, a.k.a., no export to a file.

    Returns:
        pd.DataFrame: A dataframe with one paper per row.

    Raises:
        ValueError: If `ARXIV_QUERIER` is still None after the lookup.
    """
    search_local_arxiv()
    if ARXIV_QUERIER is not None:
        return ARXIV_QUERIER(
            keywords=keywords,
            fields=fields,
            output_filepath=output_filepath,
        )
    raise ValueError(
        "Could not find local arxiv dump. Use `backend=api` or download dump via `paperscraper.get_dumps.arxiv"
    )

`get_arxiv_papers_api(query: str, fields: List = ['title', 'authors', 'date', 'abstract', 'journal', 'doi'], max_results: int = 99999, client_options: Dict = {'num_retries': 10}, search_options: Dict = dict(), verbose: bool = True) -> pd.DataFrame` ¶

Performs arxiv API request of a given query and returns list of papers with fields as desired.

Parameters:

Name	Type	Description	Default
`query`	`str`	Query to arxiv API. Needs to match the arxiv API notation.	required
`fields`	`List`	List of strings with fields to keep in output.	`['title', 'authors', 'date', 'abstract', 'journal', 'doi']`
`max_results`	`int`	Maximal number of results, defaults to 99999.	`99999`
`client_options`	`Dict`	Optional arguments for `arxiv.Client`. E.g.: page_size (int), delay_seconds (int), num_retries (int). NOTE: Decreasing 'num_retries' will speed up processing but might result in more frequent 'UnexpectedEmptyPageErrors'.	`{'num_retries': 10}`
`search_options`	`Dict`	Optional arguments for `arxiv.Search`. E.g.: id_list (List), sort_by, or sort_order.	`dict()`

Returns:

Type	Description
`DataFrame`	pd.DataFrame: One row per paper.

Source code in paperscraper/arxiv/arxiv.py

def get_arxiv_papers_api(
    query: str,
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    max_results: int = 99999,
    client_options: Dict = {"num_retries": 10},
    search_options: Dict = dict(),
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Query the arxiv API and collect the requested fields of every result.

    Each attribute of a result is renamed through `arxiv_field_mapper`
    (unmapped names are kept), kept only if the renamed field is requested,
    and post-processed by the matching entry of `process_fields` when there
    is one. The raw `doi` attribute is always skipped.

    Args:
        query: Query to arxiv API. Needs to match the arxiv API notation.
        fields: List of strings with fields to keep in output.
        max_results: Maximal number of results, defaults to 99999.
        client_options: Optional arguments for `arxiv.Client`. E.g.:
            page_size (int), delay_seconds (int), num_retries (int).
            NOTE: Decreasing 'num_retries' will speed up processing but might
            result in more frequent 'UnexpectedEmptyPageErrors'.
        search_options: Optional arguments for `arxiv.Search`. E.g.:
            id_list (List), sort_by, or sort_order.
        verbose: Whether to show the progress bar. Defaults to True.

    Returns:
        pd.DataFrame: One row per paper.

    """

    def _passthrough(value):
        # Fallback for fields without a dedicated post-processor.
        return value

    client = arxiv.Client(**client_options)
    search = arxiv.Search(query=query, max_results=max_results, **search_options)

    rows = []
    progress = tqdm(
        client.results(search), desc=f"Processing {query}", disable=not verbose
    )
    for paper in progress:
        row = {}
        for attribute, raw_value in vars(paper).items():
            name = arxiv_field_mapper.get(attribute, attribute)
            if name in fields and attribute != "doi":
                row[name] = process_fields.get(name, _passthrough)(raw_value)
        rows.append(row)

    return pd.DataFrame(rows)

`get_and_dump_arxiv_papers(keywords: List[Union[str, List[str]]], output_filepath: str, fields: List = ['title', 'authors', 'date', 'abstract', 'journal', 'doi'], start_date: str = 'None', end_date: str = 'None', backend: Literal['api', 'local', 'infer'] = 'api', *args, **kwargs)` ¶

Combines get_arxiv_papers and dump_papers.

Parameters:

Name	Type	Description	Default
`keywords`	`List[Union[str, List[str]]]`	List of keywords for arxiv search. The outer list level will be considered as AND separated keys, the inner level as OR separated.	required
`output_filepath`	`str`	Path where the dump will be saved.	required
`fields`	`List`	List of strings with fields to keep in output. Defaults to ['title', 'authors', 'date', 'abstract', 'journal', 'doi'].	`['title', 'authors', 'date', 'abstract', 'journal', 'doi']`
`start_date`	`str`	Start date for the search. Needs to be in format: YYYY-MM-DD, e.g. '2020-07-20'. Defaults to 'None', i.e. no specific dates are used.	`'None'`
`end_date`	`str`	End date for the search. Same notation as start_date.	`'None'`
`backend`	`Literal['api', 'local', 'infer']`	If `api`, the arXiv API is queried. If `local` the local arXiv dump is queried (has to be downloaded before). If `infer` the local dump will be used if exists, otherwise API will be queried. Defaults to `api` since it is faster.	`'api'`

Source code in paperscraper/arxiv/arxiv.py

def get_and_dump_arxiv_papers(
    keywords: List[Union[str, List[str]]],
    output_filepath: str,
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    start_date: str = "None",
    end_date: str = "None",
    backend: Literal["api", "local", "infer"] = "api",
    *args,
    **kwargs,
):
    """
    Fetch arxiv papers for the given keywords and dump them to a file.

    Combines `get_arxiv_papers_api` / `get_arxiv_papers_local` with
    `dump_papers`.

    Args:
        keywords: List of keywords for arxiv search.
            The outer list level will be considered as AND separated keys, the
            inner level as OR separated.
        output_filepath: Path where the dump will be saved.
        fields: List of strings with fields to keep in output.
            Defaults to ['title', 'authors', 'date', 'abstract',
            'journal', 'doi'].
        start_date: Start date for the search. Needs to be in format:
            YYYY-MM-DD, e.g. '2020-07-20'. Defaults to 'None', i.e. no specific
            dates are used.
        end_date: End date for the search. Same notation as start_date.
        backend: If `api`, the arXiv API is queried. If `local` the local arXiv dump
            is queried (has to be downloaded before). If `infer` the local dump will
            be used if exists, otherwise API will be queried. Defaults to `api`
            since it is faster.
        *args, **kwargs are additional arguments forwarded to
            `get_arxiv_papers_api` or `get_arxiv_papers_local`.

    Raises:
        ValueError: If `backend` is not one of 'api', 'local', 'infer', or
            if the inferred backend is neither 'api' nor 'local'.

    Note:
        The date range is part of the API query only; the local backend
        receives the raw keywords.
    """
    # Translate keywords into query.
    query = get_query_from_keywords(keywords, start_date=start_date, end_date=end_date)

    if backend not in {"api", "local", "infer"}:
        raise ValueError(
            f"Invalid backend: {backend}. Must be one of ['api', 'local', 'infer']"
        )
    if backend == "infer":
        backend = infer_backend()

    if backend == "api":
        papers = get_arxiv_papers_api(query, fields, *args, **kwargs)
    elif backend == "local":
        papers = get_arxiv_papers_local(keywords, fields, *args, **kwargs)
    else:
        # Only reachable when the inferred backend is unexpected; fail with a
        # clear message instead of an unbound `papers` below.
        raise ValueError(
            f"Inferred backend {backend!r} is not supported. Must be 'api' or 'local'"
        )
    dump_papers(papers, output_filepath)

`utils` ¶

`format_date(date_str: str) -> str` ¶

Converts a date in YYYY-MM-DD format to arXiv's YYYYMMDDTTTT format.

Source code in paperscraper/arxiv/utils.py

def format_date(date_str: str) -> str:
    """Convert a YYYY-MM-DD date into arXiv's YYYYMMDDTTTT notation.

    The time part is always emitted as ``0000``.

    Args:
        date_str (str): Date in YYYY-MM-DD format, e.g. '2020-07-20'.

    Returns:
        str: The date as YYYYMMDD followed by '0000', e.g. '202007200000'.

    Raises:
        ValueError: If `date_str` does not match the YYYY-MM-DD format.
    """
    return datetime.strptime(date_str, "%Y-%m-%d").strftime("%Y%m%d0000")

`get_query_from_keywords(keywords: List[Union[str, List[str]]], start_date: str = 'None', end_date: str = 'None') -> str` ¶

Receives a list of keywords and returns the query for the arxiv API.

Parameters:

Name	Type	Description	Default
`keywords`	`List[str, List[str]]`	Items will be AND separated. If items are lists themselves, they will be OR separated.	required
`start_date`	`str`	Start date for the search. Needs to be in format: YYYY-MM-DD, e.g. '2020-07-20'. Defaults to 'None', i.e. no specific dates are used.	`'None'`
`end_date`	`str`	End date for the search. Same notation as start_date.	`'None'`

Returns:

Name	Type	Description
`str`	`str`	query to enter to arxiv API.

Source code in paperscraper/arxiv/utils.py

def get_query_from_keywords(
    keywords: List[Union[str, List[str]]],
    start_date: str = "None",
    end_date: str = "None",
) -> str:
    """Build the arxiv API query string for a list of keywords.

    Every plain string contributes an ``all:<keyword> AND `` clause. Every
    nested list contributes its ``all:<synonym> OR `` clauses after they have
    been passed through ``finalize_disjunction``. The concatenated clauses are
    passed through ``finalize_conjunction``. Items that are neither a string
    nor a list are skipped.

    Args:
        keywords (List[Union[str, List[str]]]): Items will be AND separated.
            If items are lists themselves, they will be OR separated.
        start_date (str): Start date for the search. Needs to be in format:
            YYYY-MM-DD, e.g. '2020-07-20'. Defaults to 'None', i.e. no specific
            dates are used. If only the end date is given, EARLIEST_START is
            used as start.
        end_date (str): End date for the search. Same notation as start_date.
            If only the start date is given, today's date is used as end.

    Returns:
        str: query to enter to arxiv API.
    """
    clauses = []
    for term in keywords:
        if isinstance(term, str):
            clauses.append(f"all:{term} AND ")
        elif isinstance(term, list):
            alternatives = "".join(f"all:{synonym} OR " for synonym in term)
            clauses.append(finalize_disjunction(alternatives))
    query = finalize_conjunction("".join(clauses))

    if start_date == "None":
        if end_date == "None":
            # No date restriction requested at all.
            return query
        start_date = EARLIEST_START
    elif end_date == "None":
        end_date = datetime.now().strftime("%Y-%m-%d")

    start = format_date(start_date)
    end = format_date(end_date)
    date_filter = f" AND submittedDate:[{start} TO {end}]"
    return query + date_filter

paperscraper.arxiv