跳转至

Temporary store

labridge.func_modules.paper.store.temporary_store

labridge.func_modules.paper.store.temporary_store.RecentPaperStore

Bases: object

This class stores the recent papers of a specific user. It is constructed as a tree, with a root node.

Different papers are inserted as child nodes of the root node, the node_id is the absolute file path (in the recent paper warehouse) of the paper.

For each paper node, TextNodes recording paper contents are stored as its child nodes. Like:

root_node
├── Paper1
│   ├── node_1
│   ├── ...
│   └── node_n
└── Paper2
PARAMETER DESCRIPTION
vector_index

The vector database storing recent papers.

TYPE: VectorStoreIndex

persist_dir

The persist directory of the vector database.

TYPE: str

Note

The metadata date and time are recorded in list format for the convenience of metadata filtering. For example: ['2024-08-10'], ['09:05:03'].

Source code in labridge\func_modules\paper\store\temporary_store.py
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
class RecentPaperStore(object):
	r"""
	This class stores the recent papers of a specific user.
	It is constructed as a tree, with a root node.

	Different papers are inserted as child nodes of the root node,
	the node_id is the absolute file path (in the recent paper warehouse) of the paper.

	For each paper node, TextNodes recording paper contents are stored as its child nodes.
	A summary node inserted through `insert_summary_node` is also stored as a child of its paper node.
	Like:

	```
											root_node
										/				\
									   /				 \
									Paper1					Paper2
							/		...				\
						node_1  					node_n
	```

	Args:
		vector_index (VectorStoreIndex): The vector database storing recent papers.
		persist_dir (str): The persist directory of the vector database.

	Note:
		The metadata `date` and `time` are recorded in a list format for the convenience of metadata filtering.
		For example: ['2024-08-10'], ['09:05:03'].
	"""
	def __init__(
		self,
		vector_index: VectorStoreIndex,
		persist_dir: str
	):
		# `_root` is the directory five levels above this file.
		root = Path(__file__)
		for _ in range(5):
			root = root.parent
		self._root = root
		self.vector_index = vector_index
		self.vector_index.set_index_id(TMP_PAPER_VECTOR_INDEX_ID)
		self.persist_dir = persist_dir
		# `user_id` is derived from `_root` and `persist_dir`, so both must be set before this line.
		self._user_id = self.user_id
		self._fs = fsspec.filesystem("file")

	@classmethod
	def from_storage(
		cls,
		persist_dir: str,
		embed_model: BaseEmbedding,
	):
		r"""
		Load from an existing storage.

		Args:
			persist_dir (str): The persist directory of the existing storage.
			embed_model (BaseEmbedding): The used embedding model.

		Returns:
			RecentPaperStore
		"""
		vector_storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
		vector_index = load_index_from_storage(
			storage_context=vector_storage_context,
			index_id=TMP_PAPER_VECTOR_INDEX_ID,
			embed_model=embed_model,
		)
		return cls(
			vector_index=vector_index,
			persist_dir=persist_dir,
		)

	@property
	def user_id(self) -> str:
		r"""
		Return the user_id of this RecentPaperStore.

		The user_id is the part of `persist_dir` that is relative to
		`<root>/TMP_PAPER_VECTOR_INDEX_PERSIST_DIR`.
		"""
		user_id = Path(self.persist_dir).relative_to(self._root / TMP_PAPER_VECTOR_INDEX_PERSIST_DIR)
		return str(user_id)

	@classmethod
	def from_user_id(
		cls,
		user_id: str,
		embed_model: BaseEmbedding,
	):
		r"""
		Construct from a user_id.
		If the corresponding persist_dir of the user does not exist, a new RecentPaperStore will be created for the user.

		Args:
			user_id (str): The user_id of a Lab member.
			embed_model (BaseEmbedding): The used embedding model.

		Returns:
			RecentPaperStore

		Raises:
			ValueError: If the user_id is not among the users of the AccountManager.
		"""
		account_manager = AccountManager()

		if user_id not in account_manager.get_users():
			raise ValueError(f"Invalid user id: {user_id}.")

		# Same root computation as in `__init__`: five levels above this file.
		root = Path(__file__)
		for _ in range(5):
			root = root.parent

		# Make sure the user's paper warehouse directory exists.
		fs = fsspec.filesystem("file")
		paper_dir = str(root / f"{TMP_PAPER_WAREHOUSE_DIR}/{user_id}")
		if not fs.exists(paper_dir):
			fs.mkdirs(paper_dir)

		# Reuse the persisted index when there is one.
		persist_dir = str(root / f"{TMP_PAPER_VECTOR_INDEX_PERSIST_DIR}/{user_id}")
		if fs.exists(persist_dir):
			return cls.from_storage(
				persist_dir=persist_dir,
				embed_model=embed_model,
			)

		# Otherwise start a new index that only contains the root node.
		root_node = TextNode(
			text=f"Root node for the temporary papers of {user_id}",
			id_=TMP_PAPER_ROOT_NODE_NAME,
		)
		nodes = [root_node]
		vector_index = VectorStoreIndex(
			nodes=nodes,
			embed_model=embed_model,
		)
		return cls(
			vector_index=vector_index,
			persist_dir=persist_dir,
		)

	def _check_valid_paper(self, paper_file_path: str):
		r"""
		Check whether the paper path is valid.

		Raises:
			ValueError: If the path does not exist or its suffix is not `.pdf`.
		"""
		if not self._fs.exists(paper_file_path):
			raise ValueError(f"{paper_file_path} is not a valid file path, it does not exist.")

		suffix = Path(paper_file_path).suffix
		if suffix != ".pdf":
			raise ValueError("Only support .pdf format.")

	def check_valid_paper(self, paper_file_path: str):
		r"""
		Check whether the paper path is valid or not.

		1. Whether the paper_file_path exists.
		2. Whether the suffix is `.pdf`.

		Args:
			paper_file_path (str): The paper path.

		Returns:
			None

		Raises:
			ValueError: If the paper_file_path is not valid.
		"""
		self._check_valid_paper(paper_file_path=paper_file_path)

	def _update_node(
		self,
		node_id: str,
		node: BaseNode,
	):
		r""" Update an existing node in vector index by deleting `node_id` and inserting `node`. """
		self.vector_index.delete_nodes([node_id])
		self.vector_index.insert_nodes([node])

	def _delete_nodes(self, node_ids: List[str]):
		r""" Delete nodes from the vector index. """
		# NOTE(review): `delete_nodes` is called with the library defaults;
		# confirm whether the docstore entries are expected to be removed as well.
		self.vector_index.delete_nodes(node_ids=node_ids)

	def _get_node(self, node_id: str) -> BaseNode:
		r""" Get a node from the docstore of the vector index according to node_id. """
		return self.vector_index.docstore.get_node(node_id)

	def _get_nodes(self, node_ids: List[str]) -> List[BaseNode]:
		r""" Get nodes from the docstore of the vector index according to node_ids. """
		return self.vector_index.docstore.get_nodes(node_ids)

	def _default_transformations(self) -> List[TransformComponent]:
		r""" Return the transformations applied to the paper documents in `put`. """
		return [SentenceSplitter(chunk_size=1024, chunk_overlap=256, include_metadata=True), ]

	def file_exists(self, file_path: str) -> bool:
		r"""
		Judge whether a paper exists in the RecentPaperStore according to its filename.

		Only the file name of `file_path` is used; it is looked up under the paper directory of this user.

		Args:
			file_path (str): The file path of the paper.

		Returns:
			bool: Whether the paper exists or not.
		"""
		file_name = Path(file_path).name
		user_papers_dir = self._root / f"{TMP_PAPER_WAREHOUSE_DIR}/{self.user_id}"
		paper_file_path = str(user_papers_dir / file_name)

		try:
			self._get_node(node_id=paper_file_path)
		except ValueError:
			return False
		return True

	def put(self, paper_file_path: str, extra_metadata: dict = None):
		r"""
		Put a new paper into the vector index.

		The paper is copied into the paper directory of this user if it is not already there,
		registered as a child of the root node, read and split into doc nodes,
		and finally inserted into the vector index together with its doc nodes.
		Nothing is inserted if the path is invalid or the paper already exists in the store.

		Args:
			paper_file_path (str): The absolute path of the paper.
			extra_metadata (dict): Extra metadata of the paper.
				For example, if the paper is downloaded from arXiv,
				much structured information will be provided by the downloader.

		Returns:
			None
		"""
		try:
			self._check_valid_paper(paper_file_path=paper_file_path)
		except ValueError as exc:
			# Invalid papers are skipped on purpose (best effort), but report the reason.
			print(f"Skip putting {paper_file_path} into the temporary papers: {exc}")
			return

		file_name = Path(paper_file_path).name
		user_papers_dir = self._root / f"{TMP_PAPER_WAREHOUSE_DIR}/{self.user_id}"
		store_file_path = str(user_papers_dir / file_name)

		# `_get_node` raising ValueError is treated as "the paper is not stored yet".
		try:
			_ = self._get_node(node_id=store_file_path)
			print(f"{store_file_path} already exists in the temporary papers of user {self._user_id}.")
			return
		except ValueError:
			pass

		# Copy the file into the user's paper directory unless it is already located there.
		if str(Path(paper_file_path).parent) != str(user_papers_dir):
			self._fs.cp(paper_file_path, str(user_papers_dir))

		root_node = self._get_node(node_id=TMP_PAPER_ROOT_NODE_NAME)
		papers = root_node.child_nodes or []

		date, h_m_s = get_time()
		paper_node = TextNode(
			id_=store_file_path,
			text=f"The paper {store_file_path}",
			metadata={
				TMP_PAPER_DATE: [date,],
				TMP_PAPER_TIME: [h_m_s,],
			}
		)
		# Register the paper node as a child of the root node.
		papers.append(RelatedNodeInfo(node_id=paper_node.node_id))
		root_node.relationships[NodeRelationship.CHILD] = papers
		self._update_node(node_id=TMP_PAPER_ROOT_NODE_NAME, node=root_node)

		# read the paper:
		reader = SimpleDirectoryReader(
			input_files=[store_file_path],
			file_metadata=tmp_paper_get_file_metadata,
			filename_as_id=True,
		)
		documents = reader.load_data()

		for doc in documents:
			self.vector_index.docstore.set_document_hash(doc.get_doc_id(), doc.hash)

		doc_nodes = run_transformations(
			nodes=documents,
			transformations=self._default_transformations()
		)

		# Link every doc node to the paper node and attach the metadata.
		child_nodes = []
		for doc_node in doc_nodes:
			child_nodes.append(RelatedNodeInfo(node_id=doc_node.node_id))
			doc_node.relationships[NodeRelationship.PARENT] = RelatedNodeInfo(node_id=paper_node.node_id)
			new_metadata = {
				TMP_PAPER_NODE_TYPE_KEY: TMP_PAPER_DOC_NODE_TYPE,
				TMP_PAPER_DATE: [date],
				TMP_PAPER_TIME: [h_m_s],
			}
			if extra_metadata:
				new_metadata.update(extra_metadata)

			doc_node.metadata.update(new_metadata)
			# The node type key is excluded from both the LLM and the embedding metadata.
			doc_node.excluded_llm_metadata_keys.append(TMP_PAPER_NODE_TYPE_KEY)
			doc_node.excluded_embed_metadata_keys.append(TMP_PAPER_NODE_TYPE_KEY)

		paper_node.relationships[NodeRelationship.CHILD] = child_nodes
		nodes = doc_nodes + [paper_node]
		self.vector_index.insert_nodes(nodes=nodes)

	def get_summary_node(self, paper_file_path: str) -> Optional[BaseNode]:
		r"""
		Get the summary node of a paper.

		Args:
			paper_file_path (str): The file path of the paper, equally the node_id of the paper_node.

		Returns:
			Optional[BaseNode]: The summary node of the paper. If it does not exist, return None.
		"""
		summary_id = f"{TMP_PAPER_SUMMARY_NODE_PREFIX}{paper_file_path}"
		try:
			summary_node = self._get_node(node_id=summary_id)
			return summary_node
		except Exception as e:
			print(f"Summary node of {paper_file_path} does not exist. {e}")
			return None

	def get_paper_node(self, paper_file_path: str) -> Optional[BaseNode]:
		r"""
		Get the paper_node of a paper.

		Args:
			paper_file_path (str): The file path of the paper, equally the node_id of the paper_node.

		Returns:
			Optional[BaseNode]: The paper node.

		Raises:
			ValueError: If the paper path is not valid or the paper node does not exist.
		"""
		self._check_valid_paper(paper_file_path=paper_file_path)
		try:
			paper_node = self._get_node(node_id=paper_file_path)
			return paper_node
		except Exception as exc:
			# Keep the original error as the cause for easier debugging.
			raise ValueError(
				f"{paper_file_path} does not exists in the temporary papers of user {self._user_id}."
			) from exc

	def insert_summary_node(self, paper_file_path: str, summary_node: TextNode):
		r"""
		Attach a summary node to a paper and insert it into the vector index.

		The id of the summary node is set to `TMP_PAPER_SUMMARY_NODE_PREFIX` followed by the paper path.
		The summary node becomes a child of the paper node,
		and its metadata is updated with the metadata of the first child node of the paper.

		Args:
			paper_file_path (str): The file path of the paper, equally the node_id of the paper_node.
			summary_node (TextNode): The summary node to insert. It is modified in place.

		Raises:
			ValueError: If the paper path is not valid, the paper node does not exist,
				or the paper node has no child nodes to copy the metadata from.
		"""
		# `get_paper_node` also validates the paper path.
		paper_node = self.get_paper_node(paper_file_path=paper_file_path)

		# `child_nodes` is None when no child relationship is recorded.
		paper_docs = paper_node.child_nodes or []
		if not paper_docs:
			raise ValueError(f"{paper_file_path} has no doc nodes, the summary node can not be inserted.")

		summary_node.id_ = f"{TMP_PAPER_SUMMARY_NODE_PREFIX}{paper_file_path}"
		summary_node.relationships[NodeRelationship.PARENT] = RelatedNodeInfo(node_id=paper_node.node_id)

		# The summary node inherits the metadata of the first child node of the paper.
		doc_node = self._get_node(node_id=paper_docs[0].node_id)
		summary_node.metadata.update(doc_node.metadata)

		paper_docs.append(
			RelatedNodeInfo(node_id=summary_node.node_id)
		)
		paper_node.relationships[NodeRelationship.CHILD] = paper_docs
		self._update_node(node_id=paper_node.node_id, node=paper_node)
		self.vector_index.insert_nodes(nodes=[summary_node])

	def get_paper_nodes(self, paper_file_path: str) -> Optional[List[BaseNode]]:
		r"""
		Get the child nodes of a paper node.

		Args:
			paper_file_path (str): The file path of the paper, equally the node_id of the paper_node.

		Returns:
			Optional[List[BaseNode]]: The child nodes of the paper node.

		Raises:
			ValueError: If the paper does not exist.

		"""
		# `get_paper_node` also validates the paper path.
		paper_node = self.get_paper_node(paper_file_path=paper_file_path)
		# `child_nodes` is None when no child relationship is recorded.
		doc_nodes = paper_node.child_nodes or []
		doc_ids = [node.node_id for node in doc_nodes]
		paper_nodes = self._get_nodes(node_ids=doc_ids)
		return paper_nodes

	def get_all_relevant_node_ids(self, node_ids: List[str]) -> Optional[List[str]]:
		r"""
		Get the ids of all nodes that belong to the same papers as the input node_ids.

		Input ids that can not be resolved to a node with a parent are skipped.
		The papers are visited in the order in which they are first met in `node_ids`.

		Args:
			node_ids (List[str]): The node ids.

		Returns:
			Optional[List[str]]: The ids of the relevant nodes. If no relevant node exists, return None.
		"""
		# A dict is used as an ordered set, so that the result order is deterministic.
		paper_ids = {}
		for node_id in node_ids:
			try:
				node = self._get_node(node_id=node_id)
				paper_ids[node.parent_node.node_id] = None
			except Exception:
				continue
		if not paper_ids:
			return None

		all_ids = []
		for paper_id in paper_ids:
			paper_nodes = self.get_paper_nodes(paper_file_path=paper_id)
			all_ids.extend(node.node_id for node in paper_nodes)
		return all_ids

	def delete(self, paper_file_path: str):
		r"""
		Delete a paper from the recent paper vector index and the recent paper warehouse.

		The paper node and its child nodes are deleted from the vector index,
		the paper is removed from the children of the root node,
		and the file is removed if it is located inside the recent paper warehouse.

		Args:
			paper_file_path (str): The file path of the paper, equally the node_id of the paper_node.

		Raises:
			ValueError: If the paper path is not valid or the paper node does not exist.
		"""
		self._check_valid_paper(paper_file_path=paper_file_path)
		paper_node = self.get_paper_node(paper_file_path=paper_file_path)
		# `child_nodes` is None when no child relationship is recorded.
		doc_nodes = paper_node.child_nodes or []
		delete_ids = [paper_node.node_id]
		delete_ids.extend(doc_node.node_id for doc_node in doc_nodes)
		self._delete_nodes(node_ids=delete_ids)

		root_node = self._get_node(node_id=TMP_PAPER_ROOT_NODE_NAME)
		# Build a new list: removing items from a list while iterating over it skips elements.
		papers = [
			paper for paper in (root_node.child_nodes or [])
			if paper.node_id != paper_file_path
		]
		root_node.relationships[NodeRelationship.CHILD] = papers
		self._update_node(node_id=TMP_PAPER_ROOT_NODE_NAME, node=root_node)

		# Only files inside the warehouse are removed. The warehouse location is resolved
		# against `_root`, exactly as it is when papers are stored in `put`.
		warehouse_dir = self._root / TMP_PAPER_WAREHOUSE_DIR
		try:
			Path(paper_file_path).relative_to(warehouse_dir)
		except ValueError:
			return
		self._fs.rm(paper_file_path)

	def persist(self, persist_dir: str = None):
		r"""
		Persist to the disk.

		Args:
			persist_dir (str): The save directory. Defaults to `self.persist_dir`
		"""
		persist_dir = persist_dir or self.persist_dir
		if not self._fs.exists(persist_dir):
			self._fs.makedirs(persist_dir)
		self.vector_index.storage_context.persist(persist_dir=persist_dir)

labridge.func_modules.paper.store.temporary_store.RecentPaperStore.user_id: str property

Return the user_id of this RecentPaperStore

labridge.func_modules.paper.store.temporary_store.RecentPaperStore.check_valid_paper(paper_file_path)

Check whether the paper path is valid or not.

  1. Whether the paper_file_path exists.
  2. Whether the suffix is .pdf.
PARAMETER DESCRIPTION
paper_file_path

The paper path.

TYPE: str

RETURNS DESCRIPTION

None

RAISES DESCRIPTION
ValueError

If the paper_file_path is not valid.

Source code in labridge\func_modules\paper\store\temporary_store.py
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
def check_valid_paper(self, paper_file_path: str):
	r"""
	Validate a paper path; public entry point that delegates to `_check_valid_paper`.

	Args:
		paper_file_path (str): The path of the paper to validate.

	Returns:
		None

	Raises:
		ValueError: Propagated from `_check_valid_paper` when the path is not valid.
	"""
	# All checks live in the private helper; nothing is returned on success.
	self._check_valid_paper(
		paper_file_path=paper_file_path,
	)

labridge.func_modules.paper.store.temporary_store.RecentPaperStore.delete(paper_file_path)

Delete a paper from the recent paper vector index and the recent paper warehouse.

PARAMETER DESCRIPTION
paper_file_path

The file path of the paper, equally the node_id of the paper_node.

TYPE: str

Source code in labridge\func_modules\paper\store\temporary_store.py
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
def delete(self, paper_file_path: str):
	r"""
	Delete a paper from the recent paper vector index and the recent paper warehouse.

	The paper node and its child nodes are deleted from the vector index,
	the paper is removed from the children of the root node,
	and the file is removed if it is located inside the recent paper warehouse.

	Args:
		paper_file_path (str): The file path of the paper, equally the node_id of the paper_node.

	Raises:
		ValueError: If the paper path is not valid or the paper node does not exist.
	"""
	self._check_valid_paper(paper_file_path=paper_file_path)
	paper_node = self.get_paper_node(paper_file_path=paper_file_path)
	# `child_nodes` is None when no child relationship is recorded.
	doc_nodes = paper_node.child_nodes or []
	delete_ids = [paper_node.node_id]
	delete_ids.extend(doc_node.node_id for doc_node in doc_nodes)
	self._delete_nodes(node_ids=delete_ids)

	root_node = self._get_node(node_id=TMP_PAPER_ROOT_NODE_NAME)
	# Build a new list: removing items from a list while iterating over it skips elements.
	papers = [
		paper for paper in (root_node.child_nodes or [])
		if paper.node_id != paper_file_path
	]
	root_node.relationships[NodeRelationship.CHILD] = papers
	self._update_node(node_id=TMP_PAPER_ROOT_NODE_NAME, node=root_node)

	# Only files inside the warehouse are removed. The warehouse location is resolved
	# against `self._root`, the same base the stored paper paths are built from.
	warehouse_dir = self._root / TMP_PAPER_WAREHOUSE_DIR
	try:
		Path(paper_file_path).relative_to(warehouse_dir)
	except ValueError:
		return
	self._fs.rm(paper_file_path)

labridge.func_modules.paper.store.temporary_store.RecentPaperStore.file_exists(file_path)

Judge whether a paper exists in the RecentPaperStore according to its filename.

PARAMETER DESCRIPTION
file_path

The file path of the paper.

TYPE: str

RETURNS DESCRIPTION
bool

Whether the paper exists or not.

TYPE: bool

Source code in labridge\func_modules\paper\store\temporary_store.py
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
def file_exists(self, file_path: str) -> bool:
	r"""
	Tell whether a paper with the same file name is already stored for this user.

	Only the file name of `file_path` matters: it is joined onto the paper
	directory of this user, and the resulting path is looked up as a node_id.

	Args:
		file_path (str): The file path of the paper.

	Returns:
		bool: True if a node with that id can be fetched, False if the lookup raises ValueError.
	"""
	stored_name = Path(file_path).name
	warehouse = self._root / f"{TMP_PAPER_WAREHOUSE_DIR}/{self.user_id}"
	candidate_id = str(warehouse / stored_name)

	try:
		self._get_node(node_id=candidate_id)
	except ValueError:
		return False
	return True

labridge.func_modules.paper.store.temporary_store.RecentPaperStore.from_storage(persist_dir, embed_model) classmethod

Load from an existing storage.

PARAMETER DESCRIPTION
persist_dir

The persist directory of the existing storage.

TYPE: str

embed_model

The used embedding model.

TYPE: BaseEmbedding

RETURNS DESCRIPTION

RecentPaperStore

Source code in labridge\func_modules\paper\store\temporary_store.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
@classmethod
def from_storage(
	cls,
	persist_dir: str,
	embed_model: BaseEmbedding,
):
	r"""
	Build a RecentPaperStore from an index that was persisted earlier.

	Args:
		persist_dir (str): The persist directory of the existing storage.
		embed_model (BaseEmbedding): The used embedding model.

	Returns:
		RecentPaperStore
	"""
	# Open the persisted storage and load the index registered under the temporary-paper index id.
	context = StorageContext.from_defaults(persist_dir=persist_dir)
	loaded_index = load_index_from_storage(
		storage_context=context,
		index_id=TMP_PAPER_VECTOR_INDEX_ID,
		embed_model=embed_model,
	)
	return cls(vector_index=loaded_index, persist_dir=persist_dir)

labridge.func_modules.paper.store.temporary_store.RecentPaperStore.from_user_id(user_id, embed_model) classmethod

Construct from a user_id. If the corresponding persist_dir of the user does not exist, a new RecentPaperStore will be created for the user.

PARAMETER DESCRIPTION
user_id

The user_id of a Lab member.

TYPE: str

embed_model

The used embedding model.

TYPE: BaseEmbedding

RETURNS DESCRIPTION

RecentPaperStore

Source code in labridge\func_modules\paper\store\temporary_store.py
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
@classmethod
def from_user_id(
	cls,
	user_id: str,
	embed_model: BaseEmbedding,
):
	r"""
	Build the RecentPaperStore of a user.

	The persisted store of the user is loaded when its persist directory exists;
	otherwise a new store that only contains the root node is created.
	The paper directory of the user is created if it is missing.

	Args:
		user_id (str): The user_id of a Lab member.
		embed_model (BaseEmbedding): The used embedding model.

	Returns:
		RecentPaperStore

	Raises:
		ValueError: If the user_id is not among the users of the AccountManager.
	"""
	if user_id not in AccountManager().get_users():
		raise ValueError(f"Invalid user id: {user_id}.")

	# Walk five levels up from this file to reach the base directory.
	base_dir = Path(__file__)
	for _ in range(5):
		base_dir = base_dir.parent

	local_fs = fsspec.filesystem("file")
	warehouse_dir = str(base_dir / f"{TMP_PAPER_WAREHOUSE_DIR}/{user_id}")
	if not local_fs.exists(warehouse_dir):
		local_fs.mkdirs(warehouse_dir)

	persist_dir = str(base_dir / f"{TMP_PAPER_VECTOR_INDEX_PERSIST_DIR}/{user_id}")
	if not local_fs.exists(persist_dir):
		# Nothing persisted yet: start with an index holding only the root node.
		fresh_index = VectorStoreIndex(
			nodes=[
				TextNode(
					text=f"Root node for the temporary papers of {user_id}",
					id_=TMP_PAPER_ROOT_NODE_NAME,
				),
			],
			embed_model=embed_model,
		)
		return cls(vector_index=fresh_index, persist_dir=persist_dir)

	return cls.from_storage(persist_dir=persist_dir, embed_model=embed_model)

labridge.func_modules.paper.store.temporary_store.RecentPaperStore.get_all_relevant_node_ids(node_ids)

Get the ids of all nodes that belong to the same papers as the input node_ids.

PARAMETER DESCRIPTION
node_ids

The node ids.

TYPE: List[str]

RETURNS DESCRIPTION
Optional[List[str]]

Optional[List[str]]: The relevant doc nodes. If no relevant node exists, return None.

Source code in labridge\func_modules\paper\store\temporary_store.py
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
def get_all_relevant_node_ids(self, node_ids: List[str]) -> Optional[List[str]]:
	r"""
	Get the ids of all nodes that belong to the same papers as the input node_ids.

	Input ids that can not be resolved to a node with a parent are skipped.
	The papers are visited in the order in which they are first met in `node_ids`.

	Args:
		node_ids (List[str]): The node ids.

	Returns:
		Optional[List[str]]: The ids of the relevant nodes. If no relevant node exists, return None.
	"""
	# A dict is used as an ordered set, so that the result order is deterministic.
	paper_ids = {}
	for node_id in node_ids:
		try:
			node = self._get_node(node_id=node_id)
			paper_ids[node.parent_node.node_id] = None
		except Exception:
			# Unknown ids and nodes without a parent are ignored.
			continue
	if not paper_ids:
		return None

	all_ids = []
	for paper_id in paper_ids:
		paper_nodes = self.get_paper_nodes(paper_file_path=paper_id)
		all_ids.extend(node.node_id for node in paper_nodes)
	return all_ids

labridge.func_modules.paper.store.temporary_store.RecentPaperStore.get_paper_node(paper_file_path)

Get the paper_node of a paper.

PARAMETER DESCRIPTION
paper_file_path

The file path of the paper, equally the node_id of the paper_node.

TYPE: str

RETURNS DESCRIPTION
Optional[BaseNode]

Optional[BaseNode]: The paper node.

RAISES DESCRIPTION
ValueError

If the paper node does not exist.

Source code in labridge\func_modules\paper\store\temporary_store.py
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
def get_paper_node(self, paper_file_path: str) -> Optional[BaseNode]:
	r"""
	Fetch the node that represents a paper.

	Args:
		paper_file_path (str): The file path of the paper, equally the node_id of the paper_node.

	Returns:
		Optional[BaseNode]: The paper node.

	Raises:
		ValueError: If the paper path is not valid, or no node is stored under that path.
	"""
	# The path itself must be valid before the node is looked up.
	self._check_valid_paper(paper_file_path=paper_file_path)
	try:
		return self._get_node(node_id=paper_file_path)
	except Exception:
		# Any lookup failure is reported as a missing paper.
		raise ValueError(f"{paper_file_path} does not exists in the temporary papers of user {self._user_id}.")

labridge.func_modules.paper.store.temporary_store.RecentPaperStore.get_paper_nodes(paper_file_path)

Get the doc nodes of a paper.

PARAMETER DESCRIPTION
paper_file_path

The file path of the paper, equally the node_id of the paper_node.

TYPE: str

RETURNS DESCRIPTION
Optional[List[BaseNode]]

Optional[List[BaseNode]]: The doc nodes of the paper.

RAISES DESCRIPTION
ValueError

If the paper does not exist.

Source code in labridge\func_modules\paper\store\temporary_store.py
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
def get_paper_nodes(self, paper_file_path: str) -> Optional[List[BaseNode]]:
	r"""
	Fetch the child nodes recorded under a paper node.

	Args:
		paper_file_path (str): The file path of the paper, equally the node_id of the paper_node.

	Returns:
		Optional[List[BaseNode]]: The nodes referenced as children of the paper node.

	Raises:
		ValueError: If the paper does not exist.

	"""
	self._check_valid_paper(paper_file_path=paper_file_path)
	children = self.get_paper_node(paper_file_path=paper_file_path).child_nodes
	# Resolve the child references into the stored nodes.
	child_ids = [child.node_id for child in children]
	return self._get_nodes(node_ids=child_ids)

labridge.func_modules.paper.store.temporary_store.RecentPaperStore.get_summary_node(paper_file_path)

Get the summary node of a paper.

PARAMETER DESCRIPTION
paper_file_path

The file path of the paper, equally the node_id of the paper_node.

TYPE: str

RETURNS DESCRIPTION
Optional[BaseNode]

Optional[BaseNode]: The summary node of the paper. If it does not exist, return None.

Source code in labridge\func_modules\paper\store\temporary_store.py
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
def get_summary_node(self, paper_file_path: str) -> Optional[BaseNode]:
	r"""
	Fetch the summary node of a paper, if there is one.

	The summary node is stored under the id `TMP_PAPER_SUMMARY_NODE_PREFIX` followed by the paper path.

	Args:
		paper_file_path (str): The file path of the paper, equally the node_id of the paper_node.

	Returns:
		Optional[BaseNode]: The summary node of the paper, or None when the lookup fails.
	"""
	summary_key = f"{TMP_PAPER_SUMMARY_NODE_PREFIX}{paper_file_path}"
	try:
		found = self._get_node(node_id=summary_key)
	except Exception as e:
		# A failed lookup is reported and turned into None rather than propagated.
		print(f"Summary node of {paper_file_path} does not exist. {e}")
		return None
	return found

labridge.func_modules.paper.store.temporary_store.RecentPaperStore.persist(persist_dir=None)

Persist to the disk.

PARAMETER DESCRIPTION
persist_dir

The save directory. Defaults to self.persist_dir

TYPE: str DEFAULT: None

Source code in labridge\func_modules\paper\store\temporary_store.py
479
480
481
482
483
484
485
486
487
488
489
def persist(self, persist_dir: str = None):
	r"""
	Save the storage context of the vector index to disk.

	Args:
		persist_dir (str): The save directory. Defaults to `self.persist_dir`
	"""
	# A falsy argument falls back to the directory configured on the store.
	target_dir = persist_dir if persist_dir else self.persist_dir
	already_there = self._fs.exists(target_dir)
	if not already_there:
		self._fs.makedirs(target_dir)
	storage = self.vector_index.storage_context
	storage.persist(persist_dir=target_dir)

labridge.func_modules.paper.store.temporary_store.RecentPaperStore.put(paper_file_path, extra_metadata=None)

Put a new paper into the vector index.

PARAMETER DESCRIPTION
paper_file_path

The absolute path of the paper.

TYPE: str

extra_metadata

Extra metadata of the paper. For example, if the paper is downloaded from arXiv, much structured information will be provided by the downloader.

TYPE: dict DEFAULT: None

RETURNS DESCRIPTION

None

Source code in labridge\func_modules\paper\store\temporary_store.py
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
def put(self, paper_file_path: str, extra_metadata: dict = None):
	r"""
	Put a new paper into the vector index.

	The paper is copied into the paper directory of this user if it is not already there,
	registered as a child of the root node, read and split into doc nodes,
	and finally inserted into the vector index together with its doc nodes.
	Nothing is inserted if the path is invalid or the paper already exists in the store.

	Args:
		paper_file_path (str): The absolute path of the paper.
		extra_metadata (dict): Extra metadata of the paper.
			For example, if the paper is downloaded from arXiv,
			much structured information will be provided by the downloader.

	Returns:
		None
	"""
	try:
		self._check_valid_paper(paper_file_path=paper_file_path)
	except ValueError as exc:
		# Invalid papers are skipped on purpose (best effort), but report the reason.
		print(f"Skip putting {paper_file_path} into the temporary papers: {exc}")
		return

	file_name = Path(paper_file_path).name
	user_papers_dir = self._root / f"{TMP_PAPER_WAREHOUSE_DIR}/{self.user_id}"
	store_file_path = str(user_papers_dir / file_name)

	# `_get_node` raising ValueError is treated as "the paper is not stored yet".
	try:
		_ = self._get_node(node_id=store_file_path)
		print(f"{store_file_path} already exists in the temporary papers of user {self._user_id}.")
		return
	except ValueError:
		pass

	# Copy the file into the user's paper directory unless it is already located there.
	if str(Path(paper_file_path).parent) != str(user_papers_dir):
		self._fs.cp(paper_file_path, str(user_papers_dir))

	root_node = self._get_node(node_id=TMP_PAPER_ROOT_NODE_NAME)
	papers = root_node.child_nodes or []

	date, h_m_s = get_time()
	paper_node = TextNode(
		id_=store_file_path,
		text=f"The paper {store_file_path}",
		metadata={
			TMP_PAPER_DATE: [date,],
			TMP_PAPER_TIME: [h_m_s,],
		}
	)
	# Register the paper node as a child of the root node.
	papers.append(RelatedNodeInfo(node_id=paper_node.node_id))
	root_node.relationships[NodeRelationship.CHILD] = papers
	self._update_node(node_id=TMP_PAPER_ROOT_NODE_NAME, node=root_node)

	# read the paper:
	reader = SimpleDirectoryReader(
		input_files=[store_file_path],
		file_metadata=tmp_paper_get_file_metadata,
		filename_as_id=True,
	)
	documents = reader.load_data()

	for doc in documents:
		self.vector_index.docstore.set_document_hash(doc.get_doc_id(), doc.hash)

	doc_nodes = run_transformations(
		nodes=documents,
		transformations=self._default_transformations()
	)

	# Link every doc node to the paper node and attach the metadata.
	child_nodes = []
	for doc_node in doc_nodes:
		child_nodes.append(RelatedNodeInfo(node_id=doc_node.node_id))
		doc_node.relationships[NodeRelationship.PARENT] = RelatedNodeInfo(node_id=paper_node.node_id)
		new_metadata = {
			TMP_PAPER_NODE_TYPE_KEY: TMP_PAPER_DOC_NODE_TYPE,
			TMP_PAPER_DATE: [date],
			TMP_PAPER_TIME: [h_m_s],
		}
		if extra_metadata:
			new_metadata.update(extra_metadata)

		doc_node.metadata.update(new_metadata)
		# The node type key is excluded from both the LLM and the embedding metadata.
		doc_node.excluded_llm_metadata_keys.append(TMP_PAPER_NODE_TYPE_KEY)
		doc_node.excluded_embed_metadata_keys.append(TMP_PAPER_NODE_TYPE_KEY)

	paper_node.relationships[NodeRelationship.CHILD] = child_nodes
	nodes = doc_nodes + [paper_node]
	self.vector_index.insert_nodes(nodes=nodes)

labridge.func_modules.paper.store.temporary_store.tmp_paper_get_file_metadata(file_path)

Record these metadata in each doc node:

  • the absolute file path of the paper.
  • the date when the file is put in.
  • the time when the file is put in.
Source code in labridge\func_modules\paper\store\temporary_store.py
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def tmp_paper_get_file_metadata(file_path: str) -> Dict[str, Any]:
	r"""
	Build the metadata recorded in each doc node of a paper:

	- the file path of the paper, as given in `file_path`.
	- the date returned by `get_time()` at call time, wrapped in a list.
	- the time returned by `get_time()` at call time, wrapped in a list.
	"""
	put_date, put_time = get_time()
	# Date and time are stored as single-element lists.
	return {
		TMP_PAPER_FILE_PATH_KEY: file_path,
		TMP_PAPER_DATE: [put_date],
		TMP_PAPER_TIME: [put_time],
	}