跳转至

Paper download

labridge.callback.paper.paper_download

labridge.callback.paper.paper_download.ArxivDownloadOperation

Bases: CallBackOperationBase

This operation will download papers from arXiv for the user.

PARAMETER DESCRIPTION
llm

The used LLM.

TYPE: LLM DEFAULT: None

embed_model

The used embedding model.

TYPE: BaseEmbedding DEFAULT: None

verbose

Whether to show the inner progress.

TYPE: bool DEFAULT: False

Source code in labridge\callback\paper\paper_download.py
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
class ArxivDownloadOperation(CallBackOperationBase):
	r"""
	This operation will download papers from arXiv for the user.

	Each paper is saved as `<root>/TMP_PAPER_WAREHOUSE_DIR/<user_id>/<title>.pdf`
	and then put into the user's `RecentPaperStore`.

	Args:
		llm (LLM): The used LLM. Falls back to `Settings.llm` when not given.
		embed_model (BaseEmbedding): The used embedding model.
			Falls back to `Settings.embed_model` when not given.
		verbose (bool): Whether to show the inner progress.
		op_name (str): The operation name. Defaults to the class name.
	"""
	def __init__(
		self,
		llm: Optional[LLM] = None,
		embed_model: Optional[BaseEmbedding] = None,
		verbose: bool = False,
		op_name: Optional[str] = None,
	):
		# Climb four directory levels above this file; the result is the base
		# directory under which the paper warehouse directory is resolved.
		root = Path(__file__)
		for _ in range(4):
			root = root.parent

		self.root = root
		self._fs = fsspec.filesystem("file")
		embed_model = embed_model or Settings.embed_model
		llm = llm or Settings.llm
		super().__init__(
			llm=llm,
			embed_model=embed_model,
			verbose=verbose,
			op_name=op_name or ArxivDownloadOperation.__name__,
		)

	@staticmethod
	def _safe_file_stem(title: str) -> str:
		r"""
		Turn a paper title into a string that is safe to use as a file name.

		Path separators, characters rejected in Windows file names (`<>:"/\|?*`)
		and ASCII control characters are each replaced with `_`. Titles that
		contain none of these characters are returned unchanged.

		Args:
			title (str): The paper title.

		Returns:
			str: The sanitized title.
		"""
		unsafe = '<>:"/\\|?*'
		return "".join(
			"_" if (char in unsafe or ord(char) < 32) else char
			for char in str(title)
		)

	@staticmethod
	def _parse_operation_args(kwargs: dict) -> Tuple[str, list]:
		r"""
		Extract and validate the arguments shared by `do_operation` and `ado_operation`.

		Args:
			kwargs (dict): The keyword arguments given to the operation.

		Returns:
			Tuple[str, list]:
				The user id and the list of paper metadata. A single non-list
				`paper_infos` value is wrapped into a one-element list.

		Raises:
			ValueError: If `user_id` is missing or None, or `paper_infos` is None.
		"""
		user_id = kwargs.get("user_id", None)
		paper_infos = kwargs.get("paper_infos", [])

		if user_id is None or paper_infos is None:
			raise ValueError("These arguments must be provided: user_id, paper_infos.")

		if not isinstance(paper_infos, list):
			paper_infos = [paper_infos]
		return user_id, paper_infos

	def _get_default_path(self, user_id: str, title: str) -> Tuple[str, str]:
		r"""
		The downloaded paper will be stored in the user's recent paper warehouse.

		The user's directory is created if it does not exist yet. The file name
		is derived from the title with unsafe characters replaced by `_`,
		so that a title such as `"A/B: C?"` still maps to a single valid file.

		Args:
			user_id (str): The user id of a lab member.
			title (str): The title of the paper, will be used as the filename.

		Returns:
			Tuple[str, str]:
				The paper file directory and file name.
		"""
		file_name = f"{self._safe_file_stem(title)}.pdf"
		file_dir = self.root / TMP_PAPER_WAREHOUSE_DIR / user_id

		if not self._fs.exists(file_dir):
			self._fs.makedirs(file_dir)
		return str(file_dir), file_name

	def operation_description(self, **kwargs) -> str:
		r"""
		Describe the operation.

		Note that resolving the save paths creates the user's paper directory
		if it does not exist yet.

		Args:
			user_id (str): the user id.
			paper_infos (List[Dict[str, str]]): the metadata of papers,
				for each paper, the `title` must be provided.

		Returns:
			str: the operation description.

		Raises:
			ValueError: If `user_id` or `paper_infos` is missing or None.
		"""
		user_id = kwargs.get("user_id", None)
		paper_infos = kwargs.get("paper_infos", None)

		if user_id is None or paper_infos is None:
			raise ValueError("should provide valid user_id, paper_infos.")

		papers = []
		for paper in paper_infos:
			title = paper.get("title", None)
			file_dir, file_name = self._get_default_path(user_id=user_id, title=title)
			save_path = str(Path(file_dir) / file_name)
			papers.append(PAPER_DESCRIPTION_TMPL.format(title=title, save_path=save_path))

		header = ARXIV_DOWNLOAD_DESCRIPTION.format(user_id=user_id)
		body = "\n\n".join(papers)
		return f"{header}\n{body}"

	def download_paper(self, user_id: str, title: str, pdf_url: str) -> Optional[str]:
		r"""
		Download a paper from arxiv and save to the user's recent paper directory.

		Args:
			user_id (str): The user id of a lab member.
			title (str): The paper title.
			pdf_url (str): The paper URL.

		Returns:
			Optional[str]:

				- If the paper is successfully downloaded, return the file_path.
				- If the downloading fails, return None.

		Raises:
			ValueError: If any of `user_id`, `title`, `pdf_url` is None.
		"""
		if user_id is None or title is None or pdf_url is None:
			raise ValueError("should provide valid user_id, title, pdf_url to download paper.")
		file_dir, file_name = self._get_default_path(user_id=user_id, title=title)
		# Only the pdf_url of the Result is set; its download_pdf method does the download.
		result = Result(entry_id="")
		result.pdf_url = pdf_url

		if self._verbose:
			print_text(text=f"Downloading paper '{title}' ...", color="pink", end="\n")

		try:
			result.download_pdf(dirpath=file_dir, filename=file_name)
			return str(Path(file_dir) / file_name)
		except Exception as e:
			# Best effort: a failed download is reported and signalled with None.
			print(f"Download failed. Error: {e}")
			return None

	async def adownload_paper(self, user_id: str, title: str, pdf_url: str) -> Optional[str]:
		r"""
		Asynchronously download a paper from arxiv and save to the user's recent paper directory.

		Args:
			user_id (str): The user id of a lab member.
			title (str): The paper title.
			pdf_url (str): The paper URL.

		Returns:
			Optional[str]:

				- If the paper is successfully downloaded, return the file_path.
				- If the downloading fails, return None.

		Raises:
			ValueError: If any of `user_id`, `title`, `pdf_url` is None.
		"""
		if user_id is None or title is None or pdf_url is None:
			raise ValueError("should provide valid user_id, title, pdf_url to download paper.")

		file_dir, file_name = self._get_default_path(user_id=user_id, title=title)
		file_path = str(Path(file_dir) / file_name)

		if self._verbose:
			print_text(text=f"Downloading paper '{title}' ...", color="pink", end="\n")
		try:
			await adownload_file(url=pdf_url, save_path=file_path)
			return file_path
		except Exception as e:
			# Best effort: a failed download is reported and signalled with None.
			print(f"Download failed. Error: {e}")
			return None

	def _get_log(
		self,
		user_id: str,
		succeed_papers: List[Tuple[str, str]],
		fail_papers: List[str]
	) -> OperationOutputLog:
		r"""
		Get the operation log.

		Args:
			user_id (str): The user id of a lab member.
			succeed_papers (List[Tuple[str, str]]): `(title, file_path)` of each downloaded paper.
			fail_papers (List[str]): The titles of the papers that failed to download.

		Returns:
			OperationOutputLog:
				A log whose system part holds the textual description and the
				dumped `PaperInfo` of every downloaded paper as references.
		"""
		logs = []
		if succeed_papers:
			logs.append(f"Successfully download these papers, and restore them in the recent papers of user {user_id}:")

		ref_paper_infos = []

		for title, file_path in succeed_papers:
			download_log = {
				"Title": title,
				"Save path": file_path,
			}
			logs.append(json.dumps(download_log))
			paper_info = PaperInfo(
				title=title,
				file_path=file_path,
				possessor=user_id,
			)
			ref_paper_infos.append(paper_info.dumps())

		if fail_papers:
			failed_log = "These paper downloading failed:\n"
			failed_log += "\n".join(fail_papers)
			logs.append(failed_log)

		return OperationOutputLog(
			operation_name=self.op_name,
			operation_output=None,
			log_to_user=None,
			log_to_system={
				OP_DESCRIPTION: "\n\n".join(logs),
				OP_REFERENCES: ref_paper_infos,
			}
		)

	def do_operation(self, **kwargs) -> OperationOutputLog:
		r"""
		Execute the downloading operation and return the operation log.

		Papers are downloaded one after another; each successful download is put
		into the user's recent paper store, which is persisted once at the end.

		Args:
			user_id (str): the user id.
			paper_infos (List[Dict[str, str]]): the metadata of papers,
				for each paper, the `title` and `pdf_url` must be provided

		Returns:
			OperationOutputLog:
				The operation output and log.

		Raises:
			ValueError: If `user_id` or `paper_infos` is None, or if a paper
				entry lacks `title` or `pdf_url`. In the latter case the store
				is not persisted for this call.
		"""
		user_id, paper_infos = self._parse_operation_args(kwargs)

		tmp_paper_store = RecentPaperStore.from_user_id(
			user_id=user_id,
			embed_model=self._embed_model,
		)

		succeed, fail = [], []

		for info in paper_infos:
			pdf_url = info.get("pdf_url", None)
			title = info.get("title", None)
			file_path = self.download_paper(
				user_id=user_id,
				title=title,
				pdf_url=pdf_url,
			)
			if file_path is None:
				fail.append(title)
			else:
				succeed.append((title, file_path))
				tmp_paper_store.put(paper_file_path=file_path)

		tmp_paper_store.persist()
		return self._get_log(user_id=user_id, succeed_papers=succeed, fail_papers=fail)

	async def ado_operation(self, **kwargs) -> OperationOutputLog:
		r"""
		Asynchronously do the downloading operation and return the operation log.

		One task is created per paper and all tasks are awaited together; each
		successful download is put into the user's recent paper store, which is
		persisted once all tasks have finished.

		Args:
			user_id (str): the user id.
			paper_infos (List[Dict[str, str]]): the metadata of papers,
				for each paper, the `title` and `pdf_url` must be provided

		Returns:
			OperationOutputLog:
				The operation output and log.

		Raises:
			ValueError: If `user_id` or `paper_infos` is None, or if a paper
				entry lacks `title` or `pdf_url`. In the latter case the store
				is not persisted for this call.
		"""
		user_id, paper_infos = self._parse_operation_args(kwargs)

		tmp_paper_store = RecentPaperStore.from_user_id(
			user_id=user_id,
			embed_model=self._embed_model,
		)

		succeed, fail = [], []

		async def single_op(info):
			pdf_url = info.get("pdf_url", None)
			title = info.get("title", None)
			file_path = await self.adownload_paper(
				user_id=user_id,
				title=title,
				pdf_url=pdf_url,
			)
			if file_path is None:
				fail.append(title)
			else:
				succeed.append((title, file_path))
				tmp_paper_store.put(paper_file_path=file_path)

		tasks = [asyncio.create_task(single_op(paper_info)) for paper_info in paper_infos]
		await asyncio.gather(*tasks)
		tmp_paper_store.persist()
		return self._get_log(user_id=user_id, succeed_papers=succeed, fail_papers=fail)

labridge.callback.paper.paper_download.ArxivDownloadOperation.ado_operation(**kwargs) async

Asynchronously do the downloading operation and return the log string.

PARAMETER DESCRIPTION
user_id

the user id.

TYPE: str

paper_infos

the metadata of papers, for each paper, the title and pdf_url must be provided

TYPE: List[Dict[str, str]]

RETURNS DESCRIPTION
OperationOutputLog

The operation output and log.

TYPE: OperationOutputLog

Source code in labridge\callback\paper\paper_download.py
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
async def ado_operation(self, **kwargs) -> OperationOutputLog:
	r"""
	Asynchronously do the downloading operation and return the operation log.

	One task is created per paper and all tasks are awaited together; each
	successful download is put into the user's recent paper store, which is
	persisted once all tasks have finished.

	Args:
		user_id (str): the user id.
		paper_infos (List[Dict[str, str]]): the metadata of papers,
			for each paper, the `title` and `pdf_url` must be provided.
			A single non-list value is treated as a one-element list.

	Returns:
		OperationOutputLog:
			The operation output and log.

	Raises:
		ValueError: If `user_id` or `paper_infos` is None.
	"""
	user_id = kwargs.get("user_id", None)
	paper_infos = kwargs.get("paper_infos", [])

	if user_id is None or paper_infos is None:
		raise ValueError("These arguments must be provided: user_id, paper_infos.")

	if not isinstance(paper_infos, list):
		paper_infos = [paper_infos]

	tmp_paper_store = RecentPaperStore.from_user_id(
		user_id=user_id,
		embed_model=self._embed_model,
	)

	# Filled in by the per-paper tasks below.
	succeed, fail = [], []

	async def single_op(info):
		pdf_url = info.get("pdf_url", None)
		title = info.get("title", None)
		file_path = await self.adownload_paper(
			user_id=user_id,
			title=title,
			pdf_url=pdf_url,
		)
		if file_path is None:
			fail.append(title)
		else:
			succeed.append((title, file_path))
			tmp_paper_store.put(paper_file_path=file_path)

	tasks = [asyncio.create_task(single_op(paper_info)) for paper_info in paper_infos]
	await asyncio.gather(*tasks)
	tmp_paper_store.persist()
	return self._get_log(user_id=user_id, succeed_papers=succeed, fail_papers=fail)

labridge.callback.paper.paper_download.ArxivDownloadOperation.adownload_paper(user_id, title, pdf_url) async

Asynchronously download a paper from arxiv and save to the user's recent paper directory.

Source code in labridge\callback\paper\paper_download.py
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
async def adownload_paper(self, user_id: str, title: str, pdf_url: str) -> Optional[str]:
	r"""
	Asynchronously fetch one paper and store it in the user's recent paper directory.

	Args:
		user_id (str): The user id of a lab member.
		title (str): The paper title; it determines the file name.
		pdf_url (str): The URL the PDF is fetched from.

	Returns:
		Optional[str]:

			- The path of the saved file when the download succeeds.
			- None when the download raises any exception.

	Raises:
		ValueError: If any of `user_id`, `title`, `pdf_url` is None.
	"""
	if None in (user_id, title, pdf_url):
		raise ValueError("should provide valid user_id, title, pdf_url to download paper.")

	target_dir, target_name = self._get_default_path(user_id=user_id, title=title)
	destination = str(Path(target_dir) / target_name)

	if self._verbose:
		print_text(text=f"Downloading paper '{title}' ...", color="pink", end="\n")

	try:
		await adownload_file(url=pdf_url, save_path=destination)
	except Exception as error:
		# Best effort: report the failure and signal it with None.
		print(f"Download failed. Error: {error}")
		return None
	return destination

labridge.callback.paper.paper_download.ArxivDownloadOperation.do_operation(**kwargs)

Execute the downloading operation and return the log string.

PARAMETER DESCRIPTION
user_id

the user id.

TYPE: str

paper_infos

the metadata of papers, for each paper, the title and pdf_url must be provided

TYPE: List[Dict[str, str]]

RETURNS DESCRIPTION
OperationOutputLog

The operation output and log.

TYPE: OperationOutputLog

Source code in labridge\callback\paper\paper_download.py
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
def do_operation(self, **kwargs) -> OperationOutputLog:
	r"""
	Download the requested papers one after another and return the operation log.

	Every successful download is put into the user's recent paper store, which
	is persisted once after all papers have been processed.

	Args:
		user_id (str): the user id.
		paper_infos (List[Dict[str, str]]): the metadata of papers; each entry
			is read for its `title` and `pdf_url`. A single non-list value is
			treated as a one-element list.

	Returns:
		OperationOutputLog:
			The operation output and log.

	Raises:
		ValueError: If `user_id` or `paper_infos` is None.
	"""
	owner = kwargs.get("user_id", None)
	requested = kwargs.get("paper_infos", [])

	if None in (owner, requested):
		raise ValueError("These arguments must be provided: user_id, paper_infos.")

	requested = requested if isinstance(requested, list) else [requested]

	recent_store = RecentPaperStore.from_user_id(user_id=owner, embed_model=self._embed_model)

	downloaded = []
	failed = []
	for entry in requested:
		url = entry.get("pdf_url", None)
		paper_title = entry.get("title", None)
		saved_to = self.download_paper(user_id=owner, title=paper_title, pdf_url=url)
		if saved_to is not None:
			downloaded.append((paper_title, saved_to))
			recent_store.put(paper_file_path=saved_to)
			continue
		# download_paper returned None: remember the title as a failure.
		failed.append(paper_title)

	recent_store.persist()
	return self._get_log(user_id=owner, succeed_papers=downloaded, fail_papers=failed)

labridge.callback.paper.paper_download.ArxivDownloadOperation.download_paper(user_id, title, pdf_url)

Download a paper from arxiv and save to the user's recent paper directory.

PARAMETER DESCRIPTION
user_id

The user id of a lab member.

TYPE: str

title

The paper title.

TYPE: str

pdf_url

The paper URL.

TYPE: str

RETURNS DESCRIPTION
Optional[str]

Optional[str]:

  • If the paper is successfully downloaded, return the file_path.
  • If the downloading fails, return None.
Source code in labridge\callback\paper\paper_download.py
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
def download_paper(self, user_id: str, title: str, pdf_url: str) -> Optional[str]:
	r"""
	Fetch one paper and store it in the user's recent paper directory.

	Args:
		user_id (str): The user id of a lab member.
		title (str): The paper title; it determines the file name.
		pdf_url (str): The URL the PDF is fetched from.

	Returns:
		Optional[str]:

			- The path of the saved file when the download succeeds.
			- None when the download raises any exception.

	Raises:
		ValueError: If any of `user_id`, `title`, `pdf_url` is None.
	"""
	if None in (user_id, title, pdf_url):
		raise ValueError("should provide valid user_id, title, pdf_url to download paper.")

	target_dir, target_name = self._get_default_path(user_id=user_id, title=title)

	# Only the pdf_url of the Result is set; its download_pdf method does the download.
	entry = Result(entry_id="")
	entry.pdf_url = pdf_url

	if self._verbose:
		print_text(text=f"Downloading paper '{title}' ...", color="pink", end="\n")

	try:
		entry.download_pdf(dirpath=target_dir, filename=target_name)
		return str(Path(target_dir) / target_name)
	except Exception as error:
		# Best effort: report the failure and signal it with None.
		print(f"Download failed. Error: {error}")
		return None

labridge.callback.paper.paper_download.ArxivDownloadOperation.operation_description(**kwargs)

Describe the operation.

PARAMETER DESCRIPTION
user_id

the user id.

TYPE: str

paper_infos

the metadata of papers, for each paper, the title must be provided.

TYPE: List[Dict[str, str]]

RETURNS DESCRIPTION
str

the operation description.

Source code in labridge\callback\paper\paper_download.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
def operation_description(self, **kwargs) -> str:
	r"""
	Build the text describing which papers will be downloaded and where they will be saved.

	Args:
		user_id (str): the user id.
		paper_infos (List[Dict[str, str]]): the metadata of papers; each entry
			is read for its `title`.

	Returns:
		str: the header for the user followed by one description per paper.

	Raises:
		ValueError: If `user_id` or `paper_infos` is missing or None.
	"""
	user_id = kwargs.get("user_id", None)
	paper_infos = kwargs.get("paper_infos", None)

	if None in (user_id, paper_infos):
		raise ValueError("should provide valid user_id, paper_infos.")

	entries = []
	for info in paper_infos:
		paper_title = info.get("title", None)
		directory, name = self._get_default_path(user_id=user_id, title=paper_title)
		target = str(Path(directory) / name)
		entries.append(PAPER_DESCRIPTION_TMPL.format(title=paper_title, save_path=target))

	body = "\n\n".join(entries)
	heading = ARXIV_DOWNLOAD_DESCRIPTION.format(user_id=user_id)
	return f"{heading}\n{body}"