跳转至

Arxiv

labridge.func_modules.paper.download.arxiv

labridge.func_modules.paper.download.arxiv.ArxivCategory

Bases: object

The research fields category from arXiv.

ATTRIBUTE DESCRIPTION
category

a dict containing sub dicts. - key: the research fields group name. - value: a sub dict containing the research fields categories. Each sub dict contains:

    - key: the research fields category name.
    - value: the description of this category.

TYPE: dict

persist_path

the storing path of the category dict.

TYPE: str

arxiv_category_url

the url of the arxiv category.

TYPE: str

PARAMETER DESCRIPTION
persist_path

the storing path of the category dict.

TYPE: str DEFAULT: None

Source code in labridge\func_modules\paper\download\arxiv.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
class ArxivCategory(object):
	r"""
	The research fields category from arXiv.

	On construction the category dict is loaded from `persist_path` when that file exists;
	otherwise it is parsed from the arXiv taxonomy page and then written to `persist_path`.

	Attributes:
		category (dict): a dict containing sub dicts.
			- key: the research fields group name.
			- value: a sub dict containing the research fields categories.
			Each sub dict contains:

				- key: the research fields category name.
				- value: the description of this category.
		persist_path (str): the storing path of the category dict.
		arxiv_category_url (str): the url of the arxiv category.

	Args:
		persist_path (str): the storing path of the category dict.
			Defaults to None, in which case `_default_persist_path()` is used.
	"""
	category: dict
	persist_path: str
	arxiv_category_url: str = "https://arxiv.org/category_taxonomy"

	def __init__(self, persist_path: Optional[str] = None):
		self.persist_path = persist_path or self._default_persist_path()
		# Prefer the stored copy; only fetch from arXiv when nothing has been persisted yet.
		if Path(self.persist_path).exists():
			self.category = self.load_category()
		else:
			self.category = self.category_from_arxiv()
			self.save_category()

	def _default_persist_path(self) -> str:
		r""" Default persist path: `ARXIV_CATEGORY_PATH` below the directory five levels above this file. """
		root = Path(__file__)
		# Five `.parent` steps: the file itself plus four package levels (presumably the project root).
		for _ in range(5):
			root = root.parent
		return str(root / ARXIV_CATEGORY_PATH)

	def category_from_arxiv(self, arxiv_category_url: Optional[str] = None) -> dict:
		r"""
		Parse categories from arxiv.

		The page text after the marker "Category description if available" is scanned line by line:
		a line starting with `##` opens a new group, a line starting with `####` opens a new
		category inside the current group, and every other line is appended to the description
		of the category currently being read.

		Args:
			arxiv_category_url (Optional[str]): Generally, the url is "https://arxiv.org/category_taxonomy".
				Defaults to `self.arxiv_category_url`.

		Returns:
			dict: The category dict in the following format:
				`{Group: {Category: description (str)}}`

		Raises:
			IndexError: If the marker text is not present in the downloaded page.
			KeyError: If `Extra_Descriptions` names a group or category that was not parsed.
		"""
		arxiv_category_url = arxiv_category_url or self.arxiv_category_url
		web_reader = SimpleWebPageReader(html_to_text=True)
		web_text = web_reader.load_data([arxiv_category_url])
		text = web_text[0].text
		fields_str = text.split("Category description if available")[1]
		fields_dict = dict()

		description = []
		group = None
		category = None
		for line in fields_str.split('\n'):
			line_items = line.split()
			marker = line_items[0] if line_items else None
			if marker == "##":
				# Store the pending category before switching groups; otherwise the
				# last category of every group would be lost.
				if group is not None and category is not None:
					fields_dict[group][category] = " ".join(description)
				group = " ".join(line_items[1:])
				fields_dict[group] = {}
				category = None
				description = []
			elif marker == "####":
				if group is not None and category is not None:
					fields_dict[group][category] = " ".join(description)
				category = line_items[1]
				description = [f"{' '.join(line_items[2:])}:"]
			else:
				description.append(line)

		# Store the category that was still being read when the text ended.
		if group is not None and category is not None:
			fields_dict[group][category] = " ".join(description)

		# Extra information.
		for extra_group, extra_categories in Extra_Descriptions.items():
			for extra_category, extra_text in extra_categories.items():
				fields_dict[extra_group][extra_category] += extra_text

		return fields_dict

	def load_category(self, persist_path: Optional[str] = None, fs: Optional[fsspec.AbstractFileSystem] = None):
		"""
		Load the research categories from a persist path.

		Args:
			persist_path (Optional[str]): JSON file to read. Defaults to `self.persist_path`.
			fs (Optional[fsspec.AbstractFileSystem]): Filesystem to read from. Defaults to the local one.

		Returns:
			The object decoded from the JSON file.
		"""
		fs = fs or fsspec.filesystem("file")
		persist_path = persist_path or self.persist_path
		with fs.open(persist_path, "rb") as f:
			category = json.load(f)
		return category

	def save_category(self, persist_path: Optional[str] = None, fs: Optional[fsspec.AbstractFileSystem] = None):
		"""
		Save the research categories to a persist path.

		Args:
			persist_path (Optional[str]): JSON file to write. Defaults to `self.persist_path`.
			fs (Optional[fsspec.AbstractFileSystem]): Filesystem to write to. Defaults to the local one.
		"""
		persist_path = persist_path or self.persist_path
		fs = fs or fsspec.filesystem("file")
		dirpath = str(Path(persist_path).parent)
		# Create the containing directory on first save.
		if not fs.exists(dirpath):
			fs.makedirs(dirpath)

		with fs.open(persist_path, "w") as f:
			f.write(json.dumps(self.category))

labridge.func_modules.paper.download.arxiv.ArxivCategory.category_from_arxiv(arxiv_category_url=None)

Parse categories from arxiv.

PARAMETER DESCRIPTION
arxiv_category_url

Generally, the url is "https://arxiv.org/category_taxonomy".

TYPE: Optional[str] DEFAULT: None

RETURNS DESCRIPTION
dict

The category dict in the following format: {Group: {Category: description (str)}}

TYPE: dict

Source code in labridge\func_modules\paper\download\arxiv.py
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
def category_from_arxiv(self, arxiv_category_url: Optional[str] = None) -> dict:
	r"""
	Parse categories from arxiv.

	The page text after the marker "Category description if available" is scanned line by line:
	a line starting with `##` opens a new group, a line starting with `####` opens a new
	category inside the current group, and every other line is appended to the description
	of the category currently being read.

	Args:
		arxiv_category_url (Optional[str]): Generally, the url is "https://arxiv.org/category_taxonomy".
			Defaults to `self.arxiv_category_url`.

	Returns:
		dict: The category dict in the following format:
			`{Group: {Category: description (str)}}`

	Raises:
		IndexError: If the marker text is not present in the downloaded page.
		KeyError: If `Extra_Descriptions` names a group or category that was not parsed.
	"""
	arxiv_category_url = arxiv_category_url or self.arxiv_category_url
	web_reader = SimpleWebPageReader(html_to_text=True)
	web_text = web_reader.load_data([arxiv_category_url])
	text = web_text[0].text
	fields_str = text.split("Category description if available")[1]
	fields_dict = dict()

	description = []
	group = None
	category = None
	for line in fields_str.split('\n'):
		line_items = line.split()
		marker = line_items[0] if line_items else None
		if marker == "##":
			# Store the pending category before switching groups; otherwise the
			# last category of every group would be lost.
			if group is not None and category is not None:
				fields_dict[group][category] = " ".join(description)
			group = " ".join(line_items[1:])
			fields_dict[group] = {}
			category = None
			description = []
		elif marker == "####":
			if group is not None and category is not None:
				fields_dict[group][category] = " ".join(description)
			category = line_items[1]
			description = [f"{' '.join(line_items[2:])}:"]
		else:
			description.append(line)

	# Store the category that was still being read when the text ended.
	if group is not None and category is not None:
		fields_dict[group][category] = " ".join(description)

	# Extra information.
	for extra_group, extra_categories in Extra_Descriptions.items():
		for extra_category, extra_text in extra_categories.items():
			fields_dict[extra_group][extra_category] += extra_text

	return fields_dict

labridge.func_modules.paper.download.arxiv.ArxivCategory.load_category(persist_path=None, fs=None)

Load the research categories from a persist path.

Source code in labridge\func_modules\paper\download\arxiv.py
178
179
180
181
182
183
184
def load_category(self, persist_path: Optional[str] = None, fs: Optional[fsspec.AbstractFileSystem] = None):
	"""
	Read the persisted research categories back from a JSON file.

	Args:
		persist_path (Optional[str]): File to read. Defaults to `self.persist_path`.
		fs (Optional[fsspec.AbstractFileSystem]): Filesystem to read from. Defaults to the local one.

	Returns:
		The object decoded from the JSON file.
	"""
	filesystem = fs if fs else fsspec.filesystem("file")
	source_path = persist_path if persist_path else self.persist_path
	with filesystem.open(source_path, "rb") as stream:
		return json.load(stream)

labridge.func_modules.paper.download.arxiv.ArxivCategory.save_category(persist_path=None, fs=None)

Save the research categories to a persist path.

Source code in labridge\func_modules\paper\download\arxiv.py
186
187
188
189
190
191
192
193
194
195
def save_category(self, persist_path: Optional[str] = None, fs: Optional[fsspec.AbstractFileSystem] = None):
	"""
	Write the research categories to a persist path as JSON.

	Args:
		persist_path (Optional[str]): File to write. Defaults to `self.persist_path`.
		fs (Optional[fsspec.AbstractFileSystem]): Filesystem to write to. Defaults to the local one.
	"""
	target_path = persist_path if persist_path else self.persist_path
	filesystem = fs if fs else fsspec.filesystem("file")

	# Make sure the containing directory is there before opening the file.
	parent_dir = str(Path(target_path).parent)
	if not filesystem.exists(parent_dir):
		filesystem.makedirs(parent_dir)

	serialized = json.dumps(self.category)
	with filesystem.open(target_path, "w") as stream:
		stream.write(serialized)

labridge.func_modules.paper.download.arxiv.ArxivClient

Bases: Client

Similar to the class Client in the package arxiv. The method _format_url is corrected here to enable advanced search.

For details about advanced search in arXiv, refer to Details of Query Construction

Advanced search fields:

| prefix  | explanation       |
|:-------:|:-----------------:|
| ti      | Title             |
| au      | Author            |
| abs     | Abstract          |
| co      | Comment           |
| jr      | Journal Reference |
| cat     | Subject Category  |
| rn      | Report Number     |
| id_list | Id list           |
| all     | All of the above  |

Source code in labridge\func_modules\paper\download\arxiv.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
class ArxivClient(Client):
	r"""
	Similar to the class `Client` in the package `arxiv`.
	The method `_format_url` is corrected here to enable advanced search.

	For details about advanced search in arXiv, refer to
	[Details of Query Construction](https://info.arxiv.org/help/api/user-manual.html#query_details)

	Advanced search fields:

	|	prefix	|	explanation		|
	|:---------:|:-----------------:|
	|ti			|Title				|
	|au			|Author				|
	|abs		|Abstract			|
	|co			|Comment			|
	|jr			|Journal Reference	|
	|cat		|Subject Category	|
	|rn			|Report Number		|
	|id_list	|Id list			|
	|all		|All of the above	|
	"""

	page_size: int
	"""
	Maximum number of results fetched in a single API request. Smaller pages can
	be retrieved faster, but may require more round-trips.

	The API's limit is 2000 results per page.
	"""
	delay_seconds: float
	"""
	Number of seconds to wait between API requests.

	[arXiv's Terms of Use](https://arxiv.org/help/api/tou) ask that you "make no
	more than one request every three seconds."
	"""
	num_retries: int
	"""
	Number of times to retry a failing API request before raising an Exception.
	"""
	def __init__(self, page_size: int = 100, delay_seconds: float = 3.0, num_retries: int = 3):
		r"""
		Forward the request settings to `Client`.

		Args:
			page_size (int): Maximum number of results fetched in a single API request.
			delay_seconds (float): Number of seconds to wait between API requests.
			num_retries (int): Number of retries for a failing API request.
		"""
		# The name must be exactly `__init__`; a misspelled name is just an ordinary
		# method and would never run when the client is constructed.
		super().__init__(
			page_size=page_size,
			delay_seconds=delay_seconds,
			num_retries=num_retries
		)

	def query_format(self, url_args: dict) -> str:
		r"""
		Formatted url for searching in arXiv.

		Args:
			url_args (dict): The url arguments; must contain the key "search_query".

		Returns:
			str: `self.query_url_format` filled with `search_query=...` followed by the remaining
				arguments as `&key=value`. Values are inserted as-is, without percent-encoding.
		"""
		query = url_args["search_query"]
		suffix = f"search_query={query}"
		for key in url_args.keys():
			if key != "search_query":
				suffix += f"&{key}={url_args[key]}"
		return self.query_url_format.format(suffix)

	def _format_url(self, search: Search, start: int, page_size: int) -> str:
		r"""
		Formatted url for searching in arXiv.

		Args:
			search (Search): The search whose url arguments are used.
			start (int): Value of the `start` url argument.
			page_size (int): Value of the `max_results` url argument.

		Returns:
			str: The url built by `query_format`.
		"""
		url_args = search._url_args()
		url_args.update(
			{
				"start": start,
				"max_results": page_size,
			}
		)
		return self.query_format(url_args)

labridge.func_modules.paper.download.arxiv.ArxivClient.delay_seconds: float instance-attribute

Number of seconds to wait between API requests.

arXiv's Terms of Use ask that you "make no more than one request every three seconds."

labridge.func_modules.paper.download.arxiv.ArxivClient.num_retries: int instance-attribute

Number of times to retry a failing API request before raising an Exception.

labridge.func_modules.paper.download.arxiv.ArxivClient.page_size: int instance-attribute

Maximum number of results fetched in a single API request. Smaller pages can be retrieved faster, but may require more round-trips.

The API's limit is 2000 results per page.

labridge.func_modules.paper.download.arxiv.ArxivClient.query_format(url_args)

Formatted url for searching in arXiv.

Source code in labridge\func_modules\paper\download\arxiv.py
75
76
77
78
79
80
81
82
def query_format(self, url_args: dict) -> str:
	r"""
	Build the arXiv search url from the url arguments.

	`search_query` always comes first; the remaining arguments follow in dict order as
	`&key=value`. Values are inserted as-is, without percent-encoding.
	"""
	pairs = [f"search_query={url_args['search_query']}"]
	pairs.extend(
		f"{name}={value}"
		for name, value in url_args.items()
		if name != "search_query"
	)
	return self.query_url_format.format("&".join(pairs))

labridge.func_modules.paper.download.arxiv.ArxivDailyDownloader

Bases: object

Get the recent relevant papers on arXiv.

ATTRIBUTE DESCRIPTION
category

Storing the research fields categories.

TYPE: ArxivCategory

client

For fetching papers.

TYPE: ArxivClient

recent_days

Papers published within the last recent_days days (counting back from today) will be obtained.

TYPE: int

Source code in labridge\func_modules\paper\download\arxiv.py
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
class ArxivDailyDownloader(object):
	r"""
	Get the recent relevant papers on arXiv.

	Attributes:
		category (ArxivCategory): Storing the research fields categories.
		client (ArxivClient): For fetching papers.
		recent_days (int): papers dating back to `recent_days` ago from today will be obtained.
	"""

	category: ArxivCategory
	client: ArxivClient
	recent_days: int

	def __init__(self, recent_days: int = 1):
		# `today` is captured once, when the downloader is created.
		self.today = datetime.date.today()
		self.category = ArxivCategory()
		self.client = ArxivClient()
		self.recent_days = recent_days
		# The query is a placeholder; `get_daily_papers_info` overwrites it on every call.
		self.search = Search(
			query="cat:cs.AI",
			sort_by=SortCriterion.SubmittedDate,
			sort_order=SortOrder.Descending,
		)

	def _is_valid_category(self, cat: str) -> bool:
		r"""
		Check if the category is valid

		Args:
			cat (str): a research category.

		Returns:
			bool: Whether the given category is a valid category in arXiv.
		"""
		return any(cat in categories for categories in self.category.category.values())

	def _valid_date(self, date: datetime.date, start_date: datetime.date, end_date: datetime.date) -> bool:
		r"""
		Check if the date is 'recent', i.e. lies within `[start_date, end_date]`.

		`date` may also be a `datetime.datetime` (e.g. `Result.published`). It is reduced to a
		plain date first, because Python raises `TypeError` when a `date` is ordered against
		a `datetime`.
		"""
		if isinstance(date, datetime.datetime):
			if date.tzinfo is not None:
				# Express the timestamp in local time, matching `datetime.date.today()`.
				date = date.astimezone()
			date = date.date()
		return start_date <= date <= end_date

	def get_daily_papers_info(self, relevant_categories: List[str]) -> List[Result]:
		r"""
		Get the recent papers relevant to the input categories.

		The information (e.g. Abstract, Title, Authors) of these daily papers will be sent to
		the corresponding Lab Members. The papers selected by the members will be parsed and stored
		into a proper directory.

		Args:
			relevant_categories (List[str]): The recent papers in these categories will be counted.
				Categories that are not valid arXiv categories are ignored.

		Returns:
			List[Result]: Recent papers information. Empty if no given category is valid.
		"""
		query = "+OR+".join(
			f"cat:{cat}" for cat in relevant_categories if self._is_valid_category(cat)
		)
		if not query:
			return []

		self.search.query = query
		start_date = self.today - datetime.timedelta(days=self.recent_days)
		daily_papers = []
		for result in self.client.results(search=self.search):
			# The search is sorted by submission date, newest first, so the first
			# paper outside the window ends the scan.
			if not self._valid_date(date=result.published, start_date=start_date, end_date=self.today):
				break
			daily_papers.append(result)
		return daily_papers

	def download_papers(self, paper_dict: Dict[Result, str]):
		r"""
		Download the selected papers.

		Each paper is saved as `<title>.pdf` in its save directory.

		Args:
			paper_dict (Dict[Result, str]):
				- key: paper (Result)
				- value: save_dir (str)
		"""
		for paper, save_dir in paper_dict.items():
			paper.download_pdf(dirpath=save_dir, filename=f"{paper.title}.pdf")

labridge.func_modules.paper.download.arxiv.ArxivDailyDownloader.download_papers(paper_dict)

Download the selected papers.

PARAMETER DESCRIPTION
paper_dict
  • key: paper (Result)
  • value: save_dir (str)

TYPE: Dict[Result, str]

Source code in labridge\func_modules\paper\download\arxiv.py
277
278
279
280
281
282
283
284
285
286
287
def download_papers(self, paper_dict: Dict[Result, str]):
	r"""
	Download every selected paper into its target directory.

	Each paper is saved as `<title>.pdf`.

	Args:
		paper_dict (Dict[Result, str]):
			- key: paper (Result)
			- value: save_dir (str)
	"""
	for paper, save_dir in paper_dict.items():
		pdf_name = f"{paper.title}.pdf"
		paper.download_pdf(dirpath=save_dir, filename=pdf_name)

labridge.func_modules.paper.download.arxiv.ArxivDailyDownloader.get_daily_papers_info(relevant_categories)

Get the recent papers relevant to the input categories.

The information (e.g. Abstract, Title, Authors) of these daily papers will be sent to the corresponding Lab Members. The papers selected by the members will be parsed and stored into a proper directory.

PARAMETER DESCRIPTION
relevant_categories

The recent papers in these categories will be counted.

TYPE: List[str]

Return

List[Result]: Recent papers information.

Source code in labridge\func_modules\paper\download\arxiv.py
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
def get_daily_papers_info(self, relevant_categories: List[str]) -> List[Result]:
	r"""
	Get the recent papers relevant to the input categories.

	The information (e.g. Abstract, Title, Authors) of these daily papers will be sent to
	the corresponding Lab Members. The papers selected by the members will be parsed and stored
	into a proper directory.

	Args:
		relevant_categories (List[str]): The recent papers in these categories will be counted.
			Categories that are not valid arXiv categories are ignored.

	Returns:
		List[Result]: Recent papers information. Empty if no given category is valid.
	"""
	query = "+OR+".join(
		f"cat:{cat}" for cat in relevant_categories if self._is_valid_category(cat)
	)
	if not query:
		return []

	self.search.query = query
	start_date = self.today - datetime.timedelta(days=self.recent_days)
	daily_papers = []
	for result in self.client.results(search=self.search):
		submit_date = result.published
		# `published` is a datetime while the window bounds are plain dates; Python
		# raises TypeError when ordering one against the other, so reduce it to a date.
		if isinstance(submit_date, datetime.datetime):
			if submit_date.tzinfo is not None:
				# Express the timestamp in local time, matching `datetime.date.today()`.
				submit_date = submit_date.astimezone()
			submit_date = submit_date.date()
		# The search is sorted by submission date, newest first, so the first
		# paper outside the window ends the scan.
		if not self._valid_date(date=submit_date, start_date=start_date, end_date=self.today):
			break
		daily_papers.append(result)
	return daily_papers

labridge.func_modules.paper.download.arxiv.ArxivSearcher

Bases: object

This class searches for papers in the arxiv.

ATTRIBUTE DESCRIPTION
max_results_num

Maximum number of results in a search.

TYPE: int

category

The arXiv research category.

TYPE: ArxivCategory

client

Client responsible for searching.

TYPE: ArxivClient

searcher

The parameters of searching.

TYPE: Search

Source code in labridge\func_modules\paper\download\arxiv.py
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
class ArxivSearcher(object):
	r"""
	Search arXiv for papers by title, DOI, or title/abstract.

	Attributes:
		max_results_num (int): Maximum number of results in a search.
		category (ArxivCategory): The arXiv research category.
		client (ArxivClient): Client responsible for searching.
		searcher (Search): The parameters of searching.
	"""
	def __init__(self, max_results_num: int = 5):
		self.max_results_num = max_results_num
		self.category = ArxivCategory()
		self.client = ArxivClient()
		# The query starts empty; `search` fills it in before each request.
		self.searcher = Search(
			query="",
			sort_by=SortCriterion.Relevance,
			sort_order=SortOrder.Descending,
		)

	def construct_query(
		self,
		search_str: str,
		search_mode: ArxivSearchMode,
	) -> str:
		r"""
		Build the arXiv query string for the given search mode.

		Args:
			search_str (str): The text to search for.
			search_mode (ArxivSearchMode): Which field(s) to search in.

		Returns:
			str: The query with the field prefix(es) for `search_mode`.

		Raises:
			ValueError: If `search_mode` is not one of the supported modes.
		"""
		if search_mode == ArxivSearchMode.Title:
			return f"ti:{search_str}"
		if search_mode == ArxivSearchMode.DOI:
			return f"doi:{search_str}"
		if search_mode == ArxivSearchMode.TitleAbstract:
			return f"ti:{search_str}+OR+abs:{search_str}"
		raise ValueError("Unsupported search mode.")

	def search(
		self,
		search_str: str,
		max_results_num: int = None,
		search_mode: ArxivSearchMode = None,
	) -> List[Result]:
		r"""
		Search according to the title or abstract.

		Args:
			search_str (str): The search string, typically the title or abstract.
			max_results_num (int): Maximum num of results. Defaults to None,
				in which case `self.max_results_num` is used.
			search_mode (ArxivSearchMode): Search mode. Defaults to None,
				in which case `ArxivSearchMode.Title` is used.

		Returns:
			List[Result]: The search results.
		"""
		mode = search_mode or ArxivSearchMode.Title
		self.searcher.query = self.construct_query(
			search_str=search_str,
			search_mode=mode,
		)
		limit = max_results_num or self.max_results_num

		found = []
		for paper in self.client.results(search=self.searcher):
			found.append(paper)
			# Stop pulling from the client as soon as enough results are collected.
			if len(found) >= limit:
				break
		return found

labridge.func_modules.paper.download.arxiv.ArxivSearcher.search(search_str, max_results_num=None, search_mode=None)

Search according to the title or abstract.

PARAMETER DESCRIPTION
search_str

The search string, typically the title or abstract.

TYPE: str

max_results_num

Maximum num of results. Defaults to None.

TYPE: int DEFAULT: None

search_mode

Search mode.

TYPE: ArxivSearchMode DEFAULT: None

RETURNS DESCRIPTION
List[Result]

List[Result]: The search results.

Source code in labridge\func_modules\paper\download\arxiv.py
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
def search(
	self,
	search_str: str,
	max_results_num: int = None,
	search_mode: ArxivSearchMode = None,
) -> List[Result]:
	r"""
	Search according to the title or abstract.

	Args:
		search_str (str): The search string, typically the title or abstract.
		max_results_num (int): Maximum num of results. Defaults to None,
			in which case `self.max_results_num` is used.
		search_mode (ArxivSearchMode): Search mode. Defaults to None,
			in which case `ArxivSearchMode.Title` is used.

	Returns:
		List[Result]: The search results.
	"""
	mode = search_mode or ArxivSearchMode.Title
	self.searcher.query = self.construct_query(
		search_str=search_str,
		search_mode=mode,
	)
	limit = max_results_num or self.max_results_num

	found = []
	for paper in self.client.results(search=self.searcher):
		found.append(paper)
		# Stop pulling from the client as soon as enough results are collected.
		if len(found) >= limit:
			break
	return found