Skip to content

paperscraper.arxiv

paperscraper.arxiv

XRXivQuery

Query class.

Source code in paperscraper/xrxiv/xrxiv_query.py
class XRXivQuery:
    """Keyword search over a local JSON-lines dump of paper metadata.

    The dump is read into a dataframe (``self.df``) on construction. Read
    problems do not raise: a warning is logged and ``self.errored`` is set to
    ``True`` so that callers can skip the dump.
    """

    def __init__(
        self,
        dump_filepath: str,
        fields: List[str] = ["title", "doi", "authors", "abstract", "date", "journal"],
    ):
        """
        Initialize the query class.

        Args:
            dump_filepath (str): filepath to the dump to be queried.
            fields (List[str], optional): fields contained in the dump per paper.
                Defaults to ['title', 'doi', 'authors', 'abstract', 'date', 'journal'].
        """
        self.dump_filepath = dump_filepath
        # Copy the list so the instance never aliases the shared default
        # argument (or a list the caller keeps mutating).
        self.fields = list(fields)
        self.errored = False

        try:
            self.df = pd.read_json(self.dump_filepath, lines=True)
            # Dates are stored as plain 'YYYY-MM-DD' strings.
            self.df["date"] = [date.strftime("%Y-%m-%d") for date in self.df["date"]]
        except ValueError as e:
            # NOTE: if `read_json` itself failed, `self.df` is never assigned.
            logger.warning(f"Problem in reading file {dump_filepath}: {e} - Skipping!")
            self.errored = True
        except KeyError as e:
            logger.warning(f"Key {e} missing in file from {dump_filepath} - Skipping!")
            self.errored = True

    def search_keywords(
        self,
        keywords: List[Union[str, List[str]]],
        fields: List[str] = None,
        output_filepath: str = None,
    ) -> pd.DataFrame:
        """
        Search for papers in the dump using keywords.

        Matching is case-insensitive and every keyword is handed to
        `Series.str.contains` as a regular expression. A paper is a hit if, in
        at least one searched field, all keywords match.

        Args:
            keywords (List[Union[str, List[str]]]): Items will be AND separated.
                If items are lists themselves, they will be OR separated.
            fields (List[str], optional): fields to be used in the query search.
                Defaults to None, a.k.a. search in all fields excluding date.
            output_filepath (str, optional): optional output filepath where to store
                the hits in JSONL format. Defaults to None, a.k.a., no export to a file.

        Returns:
            pd.DataFrame: A dataframe with one paper per row. It is empty if
                `keywords` is empty or no searchable field remains.
        """
        if fields is None:
            fields = self.fields
        fields = [field for field in fields if field != "date"]

        # One lower-cased pattern per AND-term; synonyms inside a term are
        # OR-ed through the regex alternation operator.
        queries = [
            "|".join([_.lower() for _ in keyword])
            if isinstance(keyword, list)
            else keyword.lower()
            for keyword in keywords
        ]

        # Start from "nothing matches" so that an empty keyword list or an
        # empty field selection gives an empty result instead of failing on an
        # unbound mask.
        hits = pd.Series(False, index=self.df.index)
        if queries:
            for field in fields:
                field_data = self.df[field].str.lower()
                field_hits = pd.Series(True, index=self.df.index)
                for query in queries:
                    # na=False: a missing value never matches; without it the
                    # mask would contain NaN and could not be used for indexing.
                    field_hits = field_hits & field_data.str.contains(query, na=False)
                hits = hits | field_hits

        result = self.df[hits]
        if output_filepath is not None:
            result.to_json(output_filepath, orient="records", lines=True)
        return result

__init__(dump_filepath: str, fields: List[str] = ['title', 'doi', 'authors', 'abstract', 'date', 'journal'])

Initialize the query class.

Parameters:

Name Type Description Default
dump_filepath str

filepath to the dump to be queried.

required
fields List[str]

Fields contained in the dump per paper. Defaults to ['title', 'doi', 'authors', 'abstract', 'date', 'journal'].

['title', 'doi', 'authors', 'abstract', 'date', 'journal']
Source code in paperscraper/xrxiv/xrxiv_query.py
def __init__(
    self,
    dump_filepath: str,
    fields: List[str] = ["title", "doi", "authors", "abstract", "date", "journal"],
):
    """
    Initialize the query class.

    The dump is read as JSON lines into `self.df`. Read problems do not raise:
    a warning is logged and `self.errored` is set to True instead.

    Args:
        dump_filepath (str): filepath to the dump to be queried.
        fields (List[str], optional): fields contained in the dump per paper.
            Defaults to ['title', 'doi', 'authors', 'abstract', 'date', 'journal'].
    """
    self.dump_filepath = dump_filepath
    # Copy the list so the instance never aliases the shared default argument
    # (or a list the caller keeps mutating).
    self.fields = list(fields)
    self.errored = False

    try:
        self.df = pd.read_json(self.dump_filepath, lines=True)
        # Dates are stored as plain 'YYYY-MM-DD' strings.
        self.df["date"] = [date.strftime("%Y-%m-%d") for date in self.df["date"]]
    except ValueError as e:
        # NOTE: if `read_json` itself failed, `self.df` is never assigned.
        logger.warning(f"Problem in reading file {dump_filepath}: {e} - Skipping!")
        self.errored = True
    except KeyError as e:
        logger.warning(f"Key {e} missing in file from {dump_filepath} - Skipping!")
        self.errored = True

search_keywords(keywords: List[Union[str, List[str]]], fields: List[str] = None, output_filepath: str = None) -> pd.DataFrame

Search for papers in the dump using keywords.

Parameters:

Name Type Description Default
keywords List[Union[str, List[str]]]

Items will be AND separated. If items are lists themselves, they will be OR separated.

required
fields List[str]

fields to be used in the query search. Defaults to None, a.k.a. search in all fields excluding date.

None
output_filepath str

optional output filepath where to store the hits in JSONL format. Defaults to None, a.k.a., no export to a file.

None

Returns:

Type Description
DataFrame

pd.DataFrame: A dataframe with one paper per row.

Source code in paperscraper/xrxiv/xrxiv_query.py
def search_keywords(
    self,
    keywords: List[Union[str, List[str]]],
    fields: List[str] = None,
    output_filepath: str = None,
) -> pd.DataFrame:
    """
    Search for papers in the dump using keywords.

    Matching is case-insensitive and every keyword is handed to
    `Series.str.contains` as a regular expression. A paper is a hit if, in at
    least one searched field, all keywords match.

    Args:
        keywords (List[Union[str, List[str]]]): Items will be AND separated.
            If items are lists themselves, they will be OR separated.
        fields (List[str], optional): fields to be used in the query search.
            Defaults to None, a.k.a. search in all fields excluding date.
        output_filepath (str, optional): optional output filepath where to store
            the hits in JSONL format. Defaults to None, a.k.a., no export to a file.

    Returns:
        pd.DataFrame: A dataframe with one paper per row. It is empty if
            `keywords` is empty or no searchable field remains.
    """
    if fields is None:
        fields = self.fields
    fields = [field for field in fields if field != "date"]

    # One lower-cased pattern per AND-term; synonyms inside a term are OR-ed
    # through the regex alternation operator.
    queries = [
        "|".join([_.lower() for _ in keyword])
        if isinstance(keyword, list)
        else keyword.lower()
        for keyword in keywords
    ]

    # Start from "nothing matches" so that an empty keyword list or an empty
    # field selection gives an empty result instead of failing on an unbound
    # mask.
    hits = pd.Series(False, index=self.df.index)
    if queries:
        for field in fields:
            field_data = self.df[field].str.lower()
            field_hits = pd.Series(True, index=self.df.index)
            for query in queries:
                # na=False: a missing value never matches; without it the mask
                # would contain NaN and could not be used for indexing.
                field_hits = field_hits & field_data.str.contains(query, na=False)
            hits = hits | field_hits

    result = self.df[hits]
    if output_filepath is not None:
        result.to_json(output_filepath, orient="records", lines=True)
    return result

dump_papers(papers: pd.DataFrame, filepath: str) -> None

Receives a pd.DataFrame, one paper per row and dumps it into a .jsonl file with one paper per line.

Parameters:

Name Type Description Default
papers DataFrame

A dataframe of paper metadata, one paper per row.

required
filepath str

Path to dump the papers, has to end with .jsonl.

required
Source code in paperscraper/utils.py
def dump_papers(papers: pd.DataFrame, filepath: str) -> None:
    """
    Receives a pd.DataFrame, one paper per row and dumps it into a .jsonl
    file with one paper per line.

    Args:
        papers (pd.DataFrame): A dataframe of paper metadata, one paper per row.
            A list of dictionaries is still accepted as a legacy input and is
            converted to a dataframe (a warning is logged).
        filepath (str): Path to dump the papers, has to end with `.jsonl`.

    Raises:
        TypeError: If `filepath` is not a string or `papers` is neither a
            dataframe nor a list of dictionaries.
        ValueError: If `filepath` does not end with `.jsonl`.
    """
    if not isinstance(filepath, str):
        raise TypeError(f"filepath must be a string, not {type(filepath)}")
    if not filepath.endswith(".jsonl"):
        raise ValueError("Please provide a filepath with .jsonl extension")

    if isinstance(papers, List) and all([isinstance(p, Dict) for p in papers]):
        papers = pd.DataFrame(papers)
        logger.warning(
            "Preferably pass a pd.DataFrame, not a list of dictionaries. "
            "Passing a list is a legacy functionality that might become deprecated."
        )

    if not isinstance(papers, pd.DataFrame):
        raise TypeError(f"papers must be a pd.DataFrame, not {type(papers)}")

    # orient="records" emits exactly one dict per row. Going through
    # `papers.T.to_dict()` keys the rows by index label, so rows sharing a
    # label (common after filtering/concatenating frames) were silently lost.
    paper_list = papers.to_dict(orient="records")

    with open(filepath, "w") as f:
        f.writelines(json.dumps(paper) + "\n" for paper in paper_list)

get_query_from_keywords(keywords: List[Union[str, List[str]]], start_date: str = 'None', end_date: str = 'None') -> str

Receives a list of keywords and returns the query for the arxiv API.

Parameters:

Name Type Description Default
keywords List[Union[str, List[str]]]

Items will be AND separated. If items are lists themselves, they will be OR separated.

required
start_date str

Start date for the search. Needs to be in format: YYYY-MM-DD, e.g. '2020-07-20'. Defaults to 'None', i.e. no specific dates are used.

'None'
end_date str

End date for the search. Same notation as start_date.

'None'

Returns:

Name Type Description
str str

query to enter to arxiv API.

Source code in paperscraper/arxiv/utils.py
def get_query_from_keywords(
    keywords: List[Union[str, List[str]]],
    start_date: str = "None",
    end_date: str = "None",
) -> str:
    """Build the arxiv API query string for a nested list of keywords.

    Args:
        keywords (List[Union[str, List[str]]]): Top-level items are AND
            separated. An item that is itself a list contributes its entries
            OR separated. Items that are neither `str` nor `list` are skipped.
        start_date (str): Start date for the search in YYYY-MM-DD format,
            e.g. '2020-07-20'. The string 'None' (default) means not given.
        end_date (str): End date for the search. Same notation as start_date.

    Returns:
        str: query to enter to arxiv API. A `submittedDate` range is appended
            only if at least one of the two dates is given; a missing start
            falls back to `EARLIEST_START`, a missing end to today's date.
    """
    pieces = []
    for key in keywords:
        if isinstance(key, str):
            pieces.append(f"all:{key} AND ")
        elif isinstance(key, list):
            inter = "".join(f"all:{syn} OR " for syn in key)
            pieces.append(finalize_disjunction(inter))
    query = finalize_conjunction("".join(pieces))

    start_missing = start_date == "None"
    end_missing = end_date == "None"
    if start_missing and end_missing:
        # No date window requested at all.
        return query
    if start_missing:
        start_date = EARLIEST_START
    elif end_missing:
        end_date = datetime.now().strftime("%Y-%m-%d")

    start = format_date(start_date)
    end = format_date(end_date)
    return query + f" AND submittedDate:[{start} TO {end}]"

get_arxiv_papers_local(keywords: List[Union[str, List[str]]], fields: List[str] = None, output_filepath: str = None) -> pd.DataFrame

Search for papers in the dump using keywords.

Parameters:

Name Type Description Default
keywords List[Union[str, List[str]]]

Items will be AND separated. If items are lists themselves, they will be OR separated.

required
fields List[str]

fields to be used in the query search. Defaults to None, a.k.a. search in all fields excluding date.

None
output_filepath str

optional output filepath where to store the hits in JSONL format. Defaults to None, a.k.a., no export to a file.

None

Returns:

Type Description
DataFrame

pd.DataFrame: A dataframe with one paper per row.

Source code in paperscraper/arxiv/arxiv.py
def get_arxiv_papers_local(
    keywords: List[Union[str, List[str]]],
    fields: List[str] = None,
    output_filepath: str = None,
) -> pd.DataFrame:
    """
    Run a keyword search against the local arxiv dump.

    `search_local_arxiv()` is called first; afterwards the module-level
    `ARXIV_QUERIER` is expected to hold the callable that performs the search.

    Args:
        keywords: Items will be AND separated. If items
            are lists themselves, they will be OR separated.
        fields: fields to be used in the query search.
            Defaults to None, a.k.a. search in all fields excluding date.
        output_filepath: optional output filepath where to store the hits in JSONL format.
            Defaults to None, a.k.a., no export to a file.

    Returns:
        pd.DataFrame: A dataframe with one paper per row.

    Raises:
        ValueError: If `ARXIV_QUERIER` is still None after looking for the dump.
    """
    search_local_arxiv()
    if ARXIV_QUERIER is not None:
        return ARXIV_QUERIER(
            keywords=keywords,
            fields=fields,
            output_filepath=output_filepath,
        )
    raise ValueError(
        "Could not find local arxiv dump. Use `backend=api` or download dump via `paperscraper.get_dumps.arxiv"
    )

get_arxiv_papers_api(query: str, fields: List = ['title', 'authors', 'date', 'abstract', 'journal', 'doi'], max_results: int = 99999, client_options: Dict = {'num_retries': 10}, search_options: Dict = dict(), verbose: bool = True) -> pd.DataFrame

Performs arxiv API request of a given query and returns list of papers with fields as desired.

Parameters:

Name Type Description Default
query str

Query to arxiv API. Needs to match the arxiv API notation.

required
fields List

List of strings with fields to keep in output.

['title', 'authors', 'date', 'abstract', 'journal', 'doi']
max_results int

Maximal number of results, defaults to 99999.

99999
client_options Dict

Optional arguments for arxiv.Client. E.g.: page_size (int), delay_seconds (int), num_retries (int). NOTE: Decreasing 'num_retries' will speed up processing but might result in more frequent 'UnexpectedEmptyPageErrors'.

{'num_retries': 10}
search_options Dict

Optional arguments for arxiv.Search. E.g.: id_list (List), sort_by, or sort_order.

dict()

Returns:

Type Description
DataFrame

pd.DataFrame: One row per paper.

Source code in paperscraper/arxiv/arxiv.py
def get_arxiv_papers_api(
    query: str,
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    max_results: int = 99999,
    client_options: Dict = {"num_retries": 10},
    search_options: Dict = dict(),
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Query the arxiv API and collect the requested fields of every result.

    Args:
        query: Query to arxiv API. Needs to match the arxiv API notation.
        fields: List of strings with fields to keep in output.
        max_results: Maximal number of results, defaults to 99999.
        client_options: Optional arguments for `arxiv.Client`. E.g.:
            page_size (int), delay_seconds (int), num_retries (int).
            NOTE: Decreasing 'num_retries' will speed up processing but might
            result in more frequent 'UnexpectedEmptyPageErrors'.
        search_options: Optional arguments for `arxiv.Search`. E.g.:
            id_list (List), sort_by, or sort_order.
        verbose: Whether to show a progress bar while results are processed.
            Defaults to True.

    Returns:
        pd.DataFrame: One row per paper.

    """
    client = arxiv.Client(**client_options)
    search = arxiv.Search(query=query, max_results=max_results, **search_options)
    progress = tqdm(
        client.results(search), desc=f"Processing {query}", disable=not verbose
    )

    def keep_as_is(x):
        # Fallback for fields without an entry in `process_fields`.
        return x

    rows = []
    for paper in progress:
        row = {}
        for key, value in vars(paper).items():
            # Result attribute names are translated to output field names.
            name = arxiv_field_mapper.get(key, key)
            # The attribute literally called "doi" is always skipped; only
            # attributes whose translated name is requested are kept.
            if key == "doi" or name not in fields:
                continue
            row[name] = process_fields.get(name, keep_as_is)(value)
        rows.append(row)
    return pd.DataFrame(rows)

get_and_dump_arxiv_papers(keywords: List[Union[str, List[str]]], output_filepath: str, fields: List = ['title', 'authors', 'date', 'abstract', 'journal', 'doi'], start_date: str = 'None', end_date: str = 'None', backend: Literal['api', 'local', 'infer'] = 'api', *args, **kwargs)

Combines get_arxiv_papers_api or get_arxiv_papers_local (depending on backend) with dump_papers.

Parameters:

Name Type Description Default
keywords List[Union[str, List[str]]]

List of keywords for arxiv search. The outer list level will be considered as AND separated keys, the inner level as OR separated.

required
output_filepath str

Path where the dump will be saved.

required
fields List

List of strings with fields to keep in output. Defaults to ['title', 'authors', 'date', 'abstract', 'journal', 'doi'].

['title', 'authors', 'date', 'abstract', 'journal', 'doi']
start_date str

Start date for the search. Needs to be in format: YYYY-MM-DD, e.g. '2020-07-20'. Defaults to 'None', i.e. no specific dates are used.

'None'
end_date str

End date for the search. Same notation as start_date.

'None'
backend Literal['api', 'local', 'infer']

If api, the arXiv API is queried. If local the local arXiv dump is queried (has to be downloaded before). If infer the local dump will be used if exists, otherwise API will be queried. Defaults to api since it is faster.

'api'
Source code in paperscraper/arxiv/arxiv.py
def get_and_dump_arxiv_papers(
    keywords: List[Union[str, List[str]]],
    output_filepath: str,
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    start_date: str = "None",
    end_date: str = "None",
    backend: Literal["api", "local", "infer"] = "api",
    *args,
    **kwargs,
):
    """
    Fetch arxiv papers for the given keywords and dump them with `dump_papers`.

    Args:
        keywords: List of keywords for arxiv search.
            The outer list level will be considered as AND separated keys, the
            inner level as OR separated.
        output_filepath: Path where the dump will be saved.
        fields: List of strings with fields to keep in output.
            Defaults to ['title', 'authors', 'date', 'abstract',
            'journal', 'doi'].
        start_date: Start date for the search. Needs to be in format:
            YYYY-MM-DD, e.g. '2020-07-20'. Defaults to 'None', i.e. no specific
            dates are used. Dates only enter the API query; the local search
            receives the keywords and fields only.
        end_date: End date for the search. Same notation as start_date.
        backend: If `api`, the arXiv API is queried. If `local` the local arXiv dump
            is queried (has to be downloaded before). If `infer` the backend is
            chosen by `infer_backend()`. Defaults to `api`.
        *args, **kwargs are forwarded to `get_arxiv_papers_api` or
            `get_arxiv_papers_local`, depending on the backend.

    Raises:
        ValueError: If `backend` is not one of 'api', 'local' or 'infer'.
    """
    # The API query is always assembled, whichever backend ends up being used.
    query = get_query_from_keywords(keywords, start_date=start_date, end_date=end_date)

    known_backends = {"api", "local", "infer"}
    if backend not in known_backends:
        raise ValueError(
            f"Invalid backend: {backend}. Must be one of ['api', 'local', 'infer']"
        )
    if backend == "infer":
        backend = infer_backend()

    if backend == "local":
        papers = get_arxiv_papers_local(keywords, fields, *args, **kwargs)
    elif backend == "api":
        papers = get_arxiv_papers_api(query, fields, *args, **kwargs)
    dump_papers(papers, output_filepath)

arxiv

get_arxiv_papers_local(keywords: List[Union[str, List[str]]], fields: List[str] = None, output_filepath: str = None) -> pd.DataFrame

Search for papers in the dump using keywords.

Parameters:

Name Type Description Default
keywords List[Union[str, List[str]]]

Items will be AND separated. If items are lists themselves, they will be OR separated.

required
fields List[str]

fields to be used in the query search. Defaults to None, a.k.a. search in all fields excluding date.

None
output_filepath str

optional output filepath where to store the hits in JSONL format. Defaults to None, a.k.a., no export to a file.

None

Returns:

Type Description
DataFrame

pd.DataFrame: A dataframe with one paper per row.

Source code in paperscraper/arxiv/arxiv.py
def get_arxiv_papers_local(
    keywords: List[Union[str, List[str]]],
    fields: List[str] = None,
    output_filepath: str = None,
) -> pd.DataFrame:
    """
    Run a keyword search against the local arxiv dump.

    `search_local_arxiv()` is called first; afterwards the module-level
    `ARXIV_QUERIER` is expected to hold the callable that performs the search.

    Args:
        keywords: Items will be AND separated. If items
            are lists themselves, they will be OR separated.
        fields: fields to be used in the query search.
            Defaults to None, a.k.a. search in all fields excluding date.
        output_filepath: optional output filepath where to store the hits in JSONL format.
            Defaults to None, a.k.a., no export to a file.

    Returns:
        pd.DataFrame: A dataframe with one paper per row.

    Raises:
        ValueError: If `ARXIV_QUERIER` is still None after looking for the dump.
    """
    search_local_arxiv()
    if ARXIV_QUERIER is not None:
        return ARXIV_QUERIER(
            keywords=keywords,
            fields=fields,
            output_filepath=output_filepath,
        )
    raise ValueError(
        "Could not find local arxiv dump. Use `backend=api` or download dump via `paperscraper.get_dumps.arxiv"
    )

get_arxiv_papers_api(query: str, fields: List = ['title', 'authors', 'date', 'abstract', 'journal', 'doi'], max_results: int = 99999, client_options: Dict = {'num_retries': 10}, search_options: Dict = dict(), verbose: bool = True) -> pd.DataFrame

Performs arxiv API request of a given query and returns list of papers with fields as desired.

Parameters:

Name Type Description Default
query str

Query to arxiv API. Needs to match the arxiv API notation.

required
fields List

List of strings with fields to keep in output.

['title', 'authors', 'date', 'abstract', 'journal', 'doi']
max_results int

Maximal number of results, defaults to 99999.

99999
client_options Dict

Optional arguments for arxiv.Client. E.g.: page_size (int), delay_seconds (int), num_retries (int). NOTE: Decreasing 'num_retries' will speed up processing but might result in more frequent 'UnexpectedEmptyPageErrors'.

{'num_retries': 10}
search_options Dict

Optional arguments for arxiv.Search. E.g.: id_list (List), sort_by, or sort_order.

dict()

Returns:

Type Description
DataFrame

pd.DataFrame: One row per paper.

Source code in paperscraper/arxiv/arxiv.py
def get_arxiv_papers_api(
    query: str,
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    max_results: int = 99999,
    client_options: Dict = {"num_retries": 10},
    search_options: Dict = dict(),
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Query the arxiv API and collect the requested fields of every result.

    Args:
        query: Query to arxiv API. Needs to match the arxiv API notation.
        fields: List of strings with fields to keep in output.
        max_results: Maximal number of results, defaults to 99999.
        client_options: Optional arguments for `arxiv.Client`. E.g.:
            page_size (int), delay_seconds (int), num_retries (int).
            NOTE: Decreasing 'num_retries' will speed up processing but might
            result in more frequent 'UnexpectedEmptyPageErrors'.
        search_options: Optional arguments for `arxiv.Search`. E.g.:
            id_list (List), sort_by, or sort_order.
        verbose: Whether to show a progress bar while results are processed.
            Defaults to True.

    Returns:
        pd.DataFrame: One row per paper.

    """
    client = arxiv.Client(**client_options)
    search = arxiv.Search(query=query, max_results=max_results, **search_options)
    progress = tqdm(
        client.results(search), desc=f"Processing {query}", disable=not verbose
    )

    def keep_as_is(x):
        # Fallback for fields without an entry in `process_fields`.
        return x

    rows = []
    for paper in progress:
        row = {}
        for key, value in vars(paper).items():
            # Result attribute names are translated to output field names.
            name = arxiv_field_mapper.get(key, key)
            # The attribute literally called "doi" is always skipped; only
            # attributes whose translated name is requested are kept.
            if key == "doi" or name not in fields:
                continue
            row[name] = process_fields.get(name, keep_as_is)(value)
        rows.append(row)
    return pd.DataFrame(rows)

get_and_dump_arxiv_papers(keywords: List[Union[str, List[str]]], output_filepath: str, fields: List = ['title', 'authors', 'date', 'abstract', 'journal', 'doi'], start_date: str = 'None', end_date: str = 'None', backend: Literal['api', 'local', 'infer'] = 'api', *args, **kwargs)

Combines get_arxiv_papers_api or get_arxiv_papers_local (depending on backend) with dump_papers.

Parameters:

Name Type Description Default
keywords List[Union[str, List[str]]]

List of keywords for arxiv search. The outer list level will be considered as AND separated keys, the inner level as OR separated.

required
output_filepath str

Path where the dump will be saved.

required
fields List

List of strings with fields to keep in output. Defaults to ['title', 'authors', 'date', 'abstract', 'journal', 'doi'].

['title', 'authors', 'date', 'abstract', 'journal', 'doi']
start_date str

Start date for the search. Needs to be in format: YYYY-MM-DD, e.g. '2020-07-20'. Defaults to 'None', i.e. no specific dates are used.

'None'
end_date str

End date for the search. Same notation as start_date.

'None'
backend Literal['api', 'local', 'infer']

If api, the arXiv API is queried. If local the local arXiv dump is queried (has to be downloaded before). If infer the local dump will be used if exists, otherwise API will be queried. Defaults to api since it is faster.

'api'
Source code in paperscraper/arxiv/arxiv.py
def get_and_dump_arxiv_papers(
    keywords: List[Union[str, List[str]]],
    output_filepath: str,
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    start_date: str = "None",
    end_date: str = "None",
    backend: Literal["api", "local", "infer"] = "api",
    *args,
    **kwargs,
):
    """
    Fetch arxiv papers for the given keywords and dump them with `dump_papers`.

    Args:
        keywords: List of keywords for arxiv search.
            The outer list level will be considered as AND separated keys, the
            inner level as OR separated.
        output_filepath: Path where the dump will be saved.
        fields: List of strings with fields to keep in output.
            Defaults to ['title', 'authors', 'date', 'abstract',
            'journal', 'doi'].
        start_date: Start date for the search. Needs to be in format:
            YYYY-MM-DD, e.g. '2020-07-20'. Defaults to 'None', i.e. no specific
            dates are used. Dates only enter the API query; the local search
            receives the keywords and fields only.
        end_date: End date for the search. Same notation as start_date.
        backend: If `api`, the arXiv API is queried. If `local` the local arXiv dump
            is queried (has to be downloaded before). If `infer` the backend is
            chosen by `infer_backend()`. Defaults to `api`.
        *args, **kwargs are forwarded to `get_arxiv_papers_api` or
            `get_arxiv_papers_local`, depending on the backend.

    Raises:
        ValueError: If `backend` is not one of 'api', 'local' or 'infer'.
    """
    # The API query is always assembled, whichever backend ends up being used.
    query = get_query_from_keywords(keywords, start_date=start_date, end_date=end_date)

    known_backends = {"api", "local", "infer"}
    if backend not in known_backends:
        raise ValueError(
            f"Invalid backend: {backend}. Must be one of ['api', 'local', 'infer']"
        )
    if backend == "infer":
        backend = infer_backend()

    if backend == "local":
        papers = get_arxiv_papers_local(keywords, fields, *args, **kwargs)
    elif backend == "api":
        papers = get_arxiv_papers_api(query, fields, *args, **kwargs)
    dump_papers(papers, output_filepath)

utils

format_date(date_str: str) -> str

Converts a date in YYYY-MM-DD format to arXiv's YYYYMMDDTTTT format.

Source code in paperscraper/arxiv/utils.py
def format_date(date_str: str) -> str:
    """Translate a YYYY-MM-DD date into arXiv's YYYYMMDDTTTT notation.

    The time part is always `0000`. `datetime.strptime` raises a ValueError
    if `date_str` does not match the YYYY-MM-DD pattern.
    """
    parsed = datetime.strptime(date_str, "%Y-%m-%d")
    return f"{parsed:%Y%m%d}0000"

get_query_from_keywords(keywords: List[Union[str, List[str]]], start_date: str = 'None', end_date: str = 'None') -> str

Receives a list of keywords and returns the query for the arxiv API.

Parameters:

Name Type Description Default
keywords List[Union[str, List[str]]]

Items will be AND separated. If items are lists themselves, they will be OR separated.

required
start_date str

Start date for the search. Needs to be in format: YYYY-MM-DD, e.g. '2020-07-20'. Defaults to 'None', i.e. no specific dates are used.

'None'
end_date str

End date for the search. Same notation as start_date.

'None'

Returns:

Name Type Description
str str

query to enter to arxiv API.

Source code in paperscraper/arxiv/utils.py
def get_query_from_keywords(
    keywords: List[Union[str, List[str]]],
    start_date: str = "None",
    end_date: str = "None",
) -> str:
    """Build the arxiv API query string for a nested list of keywords.

    Args:
        keywords (List[Union[str, List[str]]]): Top-level items are AND
            separated. An item that is itself a list contributes its entries
            OR separated. Items that are neither `str` nor `list` are skipped.
        start_date (str): Start date for the search in YYYY-MM-DD format,
            e.g. '2020-07-20'. The string 'None' (default) means not given.
        end_date (str): End date for the search. Same notation as start_date.

    Returns:
        str: query to enter to arxiv API. A `submittedDate` range is appended
            only if at least one of the two dates is given; a missing start
            falls back to `EARLIEST_START`, a missing end to today's date.
    """
    pieces = []
    for key in keywords:
        if isinstance(key, str):
            pieces.append(f"all:{key} AND ")
        elif isinstance(key, list):
            inter = "".join(f"all:{syn} OR " for syn in key)
            pieces.append(finalize_disjunction(inter))
    query = finalize_conjunction("".join(pieces))

    start_missing = start_date == "None"
    end_missing = end_date == "None"
    if start_missing and end_missing:
        # No date window requested at all.
        return query
    if start_missing:
        start_date = EARLIEST_START
    elif end_missing:
        end_date = datetime.now().strftime("%Y-%m-%d")

    start = format_date(start_date)
    end = format_date(end_date)
    return query + f" AND submittedDate:[{start} TO {end}]"