跳转至

Paper store

labridge.func_modules.paper.store.paper_store

labridge.func_modules.paper.store.paper_store.PaperDirectorySummaryStore

This class is used to store the summary of each paper directory. It is useful for storing new papers in proper directories, recommending papers to lab members, etc.

Initially, the directory summary store is constructed automatically by using an LLM to summarize each directory. Because these automatic summaries are not accurate enough, they should be updated according to the relevant research-field information provided by lab members.

Before storing directory summaries, make sure that all target papers have been added to the paper warehouse and stored in the PaperStorage.

Each directory summary node is stored in the docstore, two items are recorded:

  1. the possessor of this directory.
  2. the summary (relevant research fields) of this directory.

These two items are stored as metadata of the summary node.

PARAMETER DESCRIPTION
llm

the used llm.

TYPE: LLM DEFAULT: None

embed_model

the used embed model.

TYPE: BaseEmbedding DEFAULT: None

paper_root

the directory root of the paper warehouse.

TYPE: str DEFAULT: None

paper_summary_persist_dir

the directory storing the paper summary index.

TYPE: str DEFAULT: None

directory_summary_persist_dir

the directory storing the directory summary index.

TYPE: str DEFAULT: None

service_context

the service context.

TYPE: ServiceContext DEFAULT: None

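dir_choice_batch_size

the number of directory summary nodes presented to the LLM in each selection prompt when matching a directory for a new paper.

TYPE: int DEFAULT: 5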

Source code in labridge\func_modules\paper\store\paper_store.py
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
class PaperDirectorySummaryStore:
	r"""
	This class is used to store the summary of each paper directory.
	It is useful for storing new papers in proper directories, recommending papers to lab members, etc.

	Initially, the directory summary store is automatically constructed using LLM to summarize each directory.
	Because these automatic summaries are not accurate enough, they should be updated according to the
	relevant research fields information provided by Lab members.

	Before storing directory summaries, make sure that all target papers have been added to the paper warehouse and
	stored in the `PaperStorage`.

	Each directory summary node is stored in the docstore, two items are recorded:

	1. the possessor of this directory.
	2. the summary (relevant research fields) of this directory.

	These two items are stored as metadata of the summary node.

	Args:
		llm (LLM): the used llm.
		embed_model (BaseEmbedding): the used embed model.
		paper_root (str): the directory root of the paper warehouse.
		paper_summary_persist_dir (str): the directory storing the paper summary index.
		directory_summary_persist_dir (str): the directory storing the directory summary index.
		service_context (ServiceContext): the service context.
		dir_choice_batch_size (int): the number of directory summary nodes presented to the LLM
			in each selection prompt of `match_directory_for_new_paper`.
	"""
	def __init__(
		self,
		llm: Optional[LLM] = None,
		embed_model: Optional[BaseEmbedding] = None,
		paper_root: Union[os.PathLike, str] = None,
		paper_summary_persist_dir: Union[str, os.PathLike] = None,
		directory_summary_persist_dir: Union[str, os.PathLike] = None,
		service_context: Optional[ServiceContext] = None,
		dir_choice_batch_size: int = 5,
	):
		# The root used for all relative ids is five levels above this file;
		# the default storage locations are resolved against it.
		root = Path(__file__)
		for _ in range(5):
			root = root.parent
		self.root = root

		self.llm = llm or llm_from_settings_or_context(Settings, service_context)
		self.embed_model = embed_model or embed_model_from_settings_or_context(Settings, service_context)
		self.service_context = service_context
		self.paper_root = self._path_format(
			path=paper_root,
			default=root / DEFAULT_PAPER_WAREHOUSE_DIR,
		)
		self.paper_summary_persist_dir = self._path_format(
			path=paper_summary_persist_dir,
			default=root / DEFAULT_PAPER_SUMMARY_PERSIST_DIR,
		)
		self.directory_summary_persist_dir = self._path_format(
			path=directory_summary_persist_dir,
			default=root / DEFAULT_DIRECTORY_SUMMARY_PERSIST_DIR,
		)
		# Build the directory summary index on first use, then always load it from disk.
		if not Path(self.directory_summary_persist_dir).exists():
			self._auto_construct()
		directory_storage_context = StorageContext.from_defaults(persist_dir=self.directory_summary_persist_dir)
		self.directory_summary_index = load_index_from_storage(
			storage_context=directory_storage_context,
			index_id=DIR_SUMMARY_INDEX_ID,
			service_context=self.service_context,
		)
		self.dir_choice_batch_size = dir_choice_batch_size


	def _path_format(self, path: Union[os.PathLike, str], default: Path) -> str:
		r"""
		Normalize an optional path argument.

		Args:
			path (Union[os.PathLike, str]): the user-supplied path, or None.
			default (Path): the fallback used when `path` is None.

		Returns:
			str: `default` as a string when `path` is None, otherwise `path` converted with `os.fspath`,
				so that `PathLike` inputs are stored as plain paths as well.
		"""
		if path is None:
			return str(default)
		return os.fspath(path)

	def _auto_summarize_dir(self, directory: str, verbose: bool = False):
		r"""
		Automatically summarize each directory under the given directory.
		The given directory must be under the paper root (or be the paper root itself).

		The directory tree is traversed depth-first, children before parents, so that the summary of a
		directory can be synthesized from the keywords of its papers and the summaries of its sub-directories.
		The resulting directory summary index is persisted to `self.directory_summary_persist_dir`.

		Args:
			directory (str): the directory to summarize.
			verbose (bool): whether to report papers that are not stored in the paper summary index yet.

		Raises:
			ValueError: if `directory` is neither the paper root nor located under it.
		"""
		if directory != self.paper_root and Path(self.paper_root) not in Path(directory).parents:
			raise ValueError("Invalid directory. The input directory should be under the paper warehouse.")

		paper_summary_storage_context = StorageContext.from_defaults(persist_dir=self.paper_summary_persist_dir)
		paper_summary_index = load_index_from_storage(
			storage_context=paper_summary_storage_context,
			index_id=PAPER_SUMMARY_INDEX_ID,
			service_context=self.service_context,
		)
		doc_id_to_summary_id = paper_summary_index.index_struct.doc_id_to_summary_id

		if not Path(self.directory_summary_persist_dir).exists():
			# No persisted index yet: create a new one seeded with an empty node for the paper root.
			rel_paper_root = Path(self.paper_root).relative_to(self.root)
			root_node = TextNode(text="", id_=str(rel_paper_root), )
			root_node.relationships[NodeRelationship.SOURCE] = RelatedNodeInfo(node_id="Paper warehouse", )
			dir_summary_index = DocumentSummaryIndex(
				nodes=[root_node, ],
				llm=self.llm,
				embed_model=self.embed_model,
				service_context=self.service_context,
				response_synthesizer=get_response_synthesizer(
					llm=self.llm,
					response_mode=ResponseMode.COMPACT_ACCUMULATE
				),
			)
		else:
			directory_storage_context = StorageContext.from_defaults(persist_dir=self.directory_summary_persist_dir)
			dir_summary_index = load_index_from_storage(
				storage_context=directory_storage_context,
				index_id=DIR_SUMMARY_INDEX_ID,
				service_context=self.service_context,
			)
		dir_id_to_summary_id = dir_summary_index.index_struct.doc_id_to_summary_id

		def dfs(current_dir: Path):
			if not current_dir.is_dir():
				return

			# Post-order traversal: summarize the children before the current directory.
			for child in current_dir.iterdir():
				dfs(child)

			nodes = []
			current_dir_id = str(current_dir.relative_to(self.root))
			# The warehouse root itself gets no summary.
			if current_dir_id == DEFAULT_PAPER_WAREHOUSE_DIR:
				return

			# The possessor is the third component of the root-relative path.
			# `Path.parts` is used instead of splitting on '/' so this also works with Windows separators.
			possessor = Path(current_dir_id).parts[2]
			print_text(f">>> Processing: {current_dir}", color="blue", end="\n")
			for child in current_dir.iterdir():
				if not child.is_dir() and child.suffix == ".pdf":
					rel_paper = str(child.relative_to(self.root))
					child_main_text = rel_paper + f"_{MAINTEXT}"
					child_methods = rel_paper + f"_{METHODS}"
					for doc_id in (child_main_text, child_methods):
						if doc_id not in doc_id_to_summary_id:
							if verbose:
								print(f"{doc_id} not stored into the PaperStorage yet, "
									  f"please insert it into the PaperStorage first.")
							continue

						summary_id = doc_id_to_summary_id[doc_id]
						paper_summary_node = paper_summary_index.docstore.get_node(summary_id)
						# Get the paper keywords
						if PAPER_LEVEL_KEYWORDS in paper_summary_node.metadata:
							paper_keywords = paper_summary_node.metadata[PAPER_LEVEL_KEYWORDS]
						else:
							# extract keywords; keep the text of the response rather than the response object.
							paper_keywords = str(
								dir_summary_index._response_synthesizer.synthesize(
									query=PAPER_KEYWORDS_EXTRACT_QUERY,
									nodes=[NodeWithScore(node=paper_summary_node)]
								)
							)

						# filter metadata (possessor & paper keywords)
						paper_summary_node.metadata = {
							PAPER_POSSESSOR: possessor,
							PAPER_LEVEL_KEYWORDS: paper_keywords,
						}
						paper_summary_node.set_content("")
						paper_summary_node.relationships[NodeRelationship.SOURCE] = RelatedNodeInfo(
							node_id=current_dir_id,
						)
						nodes.append(paper_summary_node)
				elif child.is_dir():
					# Sub-directories contribute their already-computed summary node, if any.
					child_dir_id = str(child.relative_to(self.root))
					if child_dir_id in dir_id_to_summary_id:
						child_summary_id = dir_id_to_summary_id[child_dir_id]
						dir_summary_node = dir_summary_index.docstore.get_node(child_summary_id)
						dir_summary_node.relationships[NodeRelationship.SOURCE] = RelatedNodeInfo(
							node_id=current_dir_id,
						)
						nodes.append(dir_summary_node)

			# Summarize current directory based on its children
			nodes_with_scores = [NodeWithScore(node=n) for n in nodes]

			if nodes_with_scores:
				summary_response = dir_summary_index._response_synthesizer.synthesize(
					query=DIR_SUMMARIZE_QUERY,
					nodes=nodes_with_scores,
				)

				summary_response = cast(Response, summary_response)
				dir_summary_node = TextNode(
					text="",
					relationships={NodeRelationship.SOURCE: RelatedNodeInfo(node_id=current_dir_id)},
					metadata={
						PAPER_POSSESSOR: possessor,
						PAPER_LEVEL_KEYWORDS: summary_response.response,
					},
				)

				# Register the new summary node in the docstore, the index struct and the vector store.
				dir_summary_index.docstore.add_documents([dir_summary_node])
				dir_summary_index._index_struct.doc_id_to_summary_id[current_dir_id] = dir_summary_node.node_id

				id_to_embed_map = embed_nodes([dir_summary_node,], self.embed_model)
				node_with_embedding = dir_summary_node.copy()
				node_with_embedding.embedding = id_to_embed_map[dir_summary_node.node_id]
				dir_summary_index._vector_store.add([node_with_embedding, ])
				dir_summary_index._storage_context.index_store.add_index_struct(dir_summary_index._index_struct)

		dfs(Path(directory))
		if dir_summary_index.index_id != DIR_SUMMARY_INDEX_ID:
			dir_summary_index.set_index_id(DIR_SUMMARY_INDEX_ID)
		dir_summary_index.storage_context.persist(persist_dir=str(self.directory_summary_persist_dir))

	def _auto_construct(self):
		r"""
		Automatically construct the directory summary index based on the paper warehouse.

		DFS the directory tree, directory root: `self.paper_root`.
		The summary (relevant research fields) of each directory is synthesized from its child directories.

		Each summary node of a directory: ref_doc_id: the directory path relative to the root.
		"""
		self._auto_summarize_dir(self.paper_root)

	def get_dir_nodes(self):
		r"""
		Get the valid directory summary nodes.

		Returns:
			list: the summary nodes of all indexed directories that still exist under `self.root`.
		"""
		dir_id_to_summary_id = self.directory_summary_index._index_struct.doc_id_to_summary_id
		docstore = self.directory_summary_index.docstore
		# Directories that have been removed from disk are skipped.
		return [
			docstore.get_node(summary_id)
			for dir_id, summary_id in dir_id_to_summary_id.items()
			if (self.root / dir_id).exists()
		]

	def match_directory_for_new_paper(
		self,
		pdf_path: str,
		possessor: str,
		paper_summary: str = None,
		verbose: bool=False,
	) -> Union[str, None]:
		r"""
		select the most relevant (and deepest) directory for the new paper.

		The directory summary nodes are presented to the LLM in batches of `self.dir_choice_batch_size`;
		the LLM's choices and relevance scores are collected over all batches. Starting from the most
		relevant directory, the search repeatedly descends into the most relevant selected sub-directory.

		Args:
			pdf_path (str): the path of the new paper.
			possessor (str): the possessor of this new paper.
			paper_summary (str): the summary of the new paper. If None, the text of the first page
				of the PDF is used instead.
			verbose (bool): whether to show progress.

		Returns:
			Union[str, None]:
				The matched directory for the new paper. If no proper directory found, return None.

		Raises:
			ValueError: if the file is not a PDF, or if the possessor has no directory in the paper warehouse.
		"""
		pdf_path = Path(pdf_path)
		if pdf_path.suffix != ".pdf":
			raise ValueError("Only papers with PDF format are supported now.")
		possessor_dir = Path(self.paper_root) / possessor
		if not possessor_dir.exists():
			raise ValueError(f"The member {possessor} do not exist. Please sign up as a member first.")

		if paper_summary is None:
			pdf_docs = PyMuPDFReader().load_data(file_path=pdf_path)
			# typically, the first page includes conclusive information of a paper.
			paper_summary = pdf_docs[0].text

		dir_summary_nodes = self.get_dir_nodes()

		selected = []
		for idx in range(0, len(dir_summary_nodes), self.dir_choice_batch_size):
			summary_nodes = dir_summary_nodes[idx: idx + self.dir_choice_batch_size]
			dir_context_str = default_format_node_batch_fn(summary_nodes=summary_nodes)

			raw_response = self.llm.predict(
				DIR_CHOICE_SELECT_PROMPT,
				dir_context_str=dir_context_str,
				paper_str=paper_summary,
			)
			raw_choices, relevances = default_parse_choice_select_answer_fn(raw_response, len(summary_nodes))
			for choice, relevance in zip(raw_choices, relevances):
				choice_idx = choice - 1
				# The choice numbers come from LLM output: ignore anything outside the batch
				# (a choice of 0 would otherwise silently wrap around to the last node).
				if 0 <= choice_idx < len(summary_nodes):
					selected.append((summary_nodes[choice_idx], relevance))

		if not selected:
			return None

		sorted_list = sorted(selected, key=lambda x: x[1], reverse=True)
		# choose the most relevant and the deepest directory.
		best_dir = sorted_list[0][0].ref_doc_id

		if verbose:
			for node, relevance in sorted_list:
				print_text(f">>> dir: {node.ref_doc_id}, relevance: {relevance}", color="blue", end="\n")

		def sub_dir_nodes(paper_dir: str):
			# Selected directories located below `paper_dir`, still ordered by descending relevance.
			return [
				(node, score)
				for node, score in sorted_list
				if Path(paper_dir) in Path(node.ref_doc_id).parents
			]

		sub_list = sub_dir_nodes(best_dir)
		while sub_list:
			# `sub_list` keeps the order of `sorted_list`, so its first entry is the most relevant one.
			best_dir = sub_list[0][0].ref_doc_id
			sub_list = sub_dir_nodes(best_dir)
		return best_dir

	def update(self, dir_description_dict: Dict[str, str]):
		r"""
		Update the relevant research fields of each directory.
		Typically used to manually set each directory's relevant research fields.

		Args:
			dir_description_dict (Dict[str, str]): the descriptions of the paper directories
				- key: the directory path relative to root;
				- value: the relevant research fields of the directory.
		"""
		for dir_id, description in dir_description_dict.items():
			self._set_dir_metadata(
				dir_id=dir_id,
				key=PAPER_LEVEL_KEYWORDS,
				val=description,
			)

	def _set_dir_metadata(self, dir_id: str, key: str, val: Any):
		r"""
		Set one metadata item of a directory summary node and persist the directory summary index.

		Directories without a summary node are left untouched; the index is persisted in either case.

		Args:
			dir_id (str): the directory path relative to root.
			key (str): the metadata key.
			val (Any): the metadata value.
		"""
		dir_id_to_summary_id = self.directory_summary_index._index_struct.doc_id_to_summary_id
		node_collection = self.directory_summary_index.docstore._node_collection

		if dir_id in dir_id_to_summary_id:
			summary_id = dir_id_to_summary_id[dir_id]
			# The metadata is written directly into the data held by the docstore's key-value store.
			summary_store = self.directory_summary_index.docstore._kvstore._data[node_collection][summary_id]
			summary_store["__data__"]["metadata"][key] = val

		self.directory_summary_index.storage_context.persist(persist_dir=self.directory_summary_persist_dir)

	def set_possessor_research_categories(self, possessor_category_dict: Dict[str, List[str]]):
		r"""
		Set the research categories of the possessors. These research categories are used to recommend
		proper new papers to the possessors.

		Args:
			possessor_category_dict (Dict[str, List[str]]): the research categories to be set.
				It is a dictionary with:

				- key: possessor
				- value: the list of research categories. For details about research categories,
				refer to the class `ArxivCategory`.
		"""
		for possessor, categories in possessor_category_dict.items():
			# The categories are attached to the possessor's top-level directory in the paper warehouse.
			dir_id = str((Path(self.paper_root) / possessor).relative_to(self.root))
			self._set_dir_metadata(
				dir_id=dir_id,
				key=DIR_CATEGORY_NAME,
				val=categories,
			)

	def add_dir(self, directory: str, verbose: bool = False):
		r"""
		Add a directory to the paper storage.

		Args:
			directory (str): the directory to add; it must be located under the paper warehouse.
			verbose (bool): passed on to the summarization step.

		Raises:
			ValueError: if `directory` is not located under the paper warehouse.
		"""
		if Path(self.paper_root) not in Path(directory).parents:
			raise ValueError("Invalid directory path, please add your documents to the paper warehouse, "
							 "and store them in the PaperStorage first.")
		self._auto_summarize_dir(directory=directory, verbose=verbose)

labridge.func_modules.paper.store.paper_store.PaperDirectorySummaryStore.add_dir(directory, verbose=False)

Add a directory to the paper storage.

Source code in labridge\func_modules\paper\store\paper_store.py
677
678
679
680
681
682
683
684
def add_dir(self, directory: str, verbose: bool = False):
	r"""
	Register a directory of the paper warehouse in the paper storage.

	The directory has to be located below `self.paper_root`; otherwise a ValueError is raised.
	Valid directories are handed to `self._auto_summarize_dir`.
	"""
	warehouse = Path(self.paper_root)
	ancestors = Path(directory).parents
	if warehouse in ancestors:
		self._auto_summarize_dir(directory=directory, verbose=verbose)
		return
	raise ValueError("Invalid directory path, please add your documents to the paper warehouse, "
					 "and store them in the PaperStorage first.")

labridge.func_modules.paper.store.paper_store.PaperDirectorySummaryStore.get_dir_nodes()

get the valid directory summary nodes

Source code in labridge\func_modules\paper\store\paper_store.py
539
540
541
542
543
544
545
546
547
548
549
def get_dir_nodes(self):
	r"""
	Collect the summary nodes of the indexed directories that still exist.

	Every directory id recorded in the directory summary index is resolved against `self.root`;
	ids whose directory is missing are skipped.
	"""
	index = self.directory_summary_index
	id_map = index._index_struct.doc_id_to_summary_id
	return [
		index.docstore.get_node(node_id)
		for rel_dir, node_id in id_map.items()
		if (self.root / rel_dir).exists()
	]

labridge.func_modules.paper.store.paper_store.PaperDirectorySummaryStore.match_directory_for_new_paper(pdf_path, possessor, paper_summary=None, verbose=False)

select the most relevant (and deepest) directory for the new paper.

PARAMETER DESCRIPTION
pdf_path

the path of the new paper.

TYPE: str

possessor

the possessor of this new paper.

TYPE: str

paper_summary

the summary of the new paper.

TYPE: str DEFAULT: None

verbose

whether to show progress.

TYPE: bool DEFAULT: False

RETURNS DESCRIPTION
Union[str, None]

Union[str, None]: The matched directory for the new paper. If no proper directory found, return None.

Source code in labridge\func_modules\paper\store\paper_store.py
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
def match_directory_for_new_paper(
	self,
	pdf_path: str,
	possessor: str,
	paper_summary: str = None,
	verbose: bool=False,
) -> Union[str, None]:
	r"""
	select the most relevant (and deepest) directory for the new paper.

	The directory summary nodes are presented to the LLM in batches of `self.dir_choice_batch_size`;
	the LLM's choices and relevance scores are collected over all batches. Starting from the most
	relevant directory, the search repeatedly descends into the most relevant selected sub-directory.

	Args:
		pdf_path (str): the path of the new paper.
		possessor (str): the possessor of this new paper.
		paper_summary (str): the summary of the new paper. If None, the text of the first page
			of the PDF is used instead.
		verbose (bool): whether to show progress.

	Returns:
		Union[str, None]:
			The matched directory for the new paper. If no proper directory found, return None.

	Raises:
		ValueError: if the file is not a PDF, or if the possessor has no directory in the paper warehouse.
	"""
	pdf_path = Path(pdf_path)
	if pdf_path.suffix != ".pdf":
		raise ValueError("Only papers with PDF format are supported now.")
	possessor_dir = Path(self.paper_root) / possessor
	if not possessor_dir.exists():
		raise ValueError(f"The member {possessor} do not exist. Please sign up as a member first.")

	if paper_summary is None:
		pdf_docs = PyMuPDFReader().load_data(file_path=pdf_path)
		# typically, the first page includes conclusive information of a paper.
		paper_summary = pdf_docs[0].text

	dir_summary_nodes = self.get_dir_nodes()

	selected = []
	for idx in range(0, len(dir_summary_nodes), self.dir_choice_batch_size):
		summary_nodes = dir_summary_nodes[idx: idx + self.dir_choice_batch_size]
		dir_context_str = default_format_node_batch_fn(summary_nodes=summary_nodes)

		raw_response = self.llm.predict(
			DIR_CHOICE_SELECT_PROMPT,
			dir_context_str=dir_context_str,
			paper_str=paper_summary,
		)
		raw_choices, relevances = default_parse_choice_select_answer_fn(raw_response, len(summary_nodes))
		for choice, relevance in zip(raw_choices, relevances):
			choice_idx = choice - 1
			# The choice numbers come from LLM output: ignore anything outside the batch
			# (a choice of 0 would otherwise silently wrap around to the last node).
			if 0 <= choice_idx < len(summary_nodes):
				selected.append((summary_nodes[choice_idx], relevance))

	if not selected:
		return None

	sorted_list = sorted(selected, key=lambda x: x[1], reverse=True)
	# choose the most relevant and the deepest directory.
	best_dir = sorted_list[0][0].ref_doc_id

	if verbose:
		for node, relevance in sorted_list:
			print_text(f">>> dir: {node.ref_doc_id}, relevance: {relevance}", color="blue", end="\n")

	def sub_dir_nodes(paper_dir: str):
		# Selected directories located below `paper_dir`, still ordered by descending relevance.
		return [
			(node, score)
			for node, score in sorted_list
			if Path(paper_dir) in Path(node.ref_doc_id).parents
		]

	sub_list = sub_dir_nodes(best_dir)
	while sub_list:
		# `sub_list` keeps the order of `sorted_list`, so its first entry is the most relevant one.
		best_dir = sub_list[0][0].ref_doc_id
		sub_list = sub_dir_nodes(best_dir)
	return best_dir

labridge.func_modules.paper.store.paper_store.PaperDirectorySummaryStore.set_possessor_research_categories(possessor_category_dict)

Set the research categories of the possessors. These research categories are used to recommend proper new papers to the possessors.

PARAMETER DESCRIPTION
possessor_category_dict

the research categories to be set. It is a dictionary with:

  • key: possessor
  • value: the list of research categories. For details about research categories, refer to the class ArxivCategory.

TYPE: Dict[str, List[str]]

Source code in labridge\func_modules\paper\store\paper_store.py
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
def set_possessor_research_categories(self, possessor_category_dict: Dict[str, List[str]]):
	r"""
	Record the research categories of each possessor.

	The categories are stored as metadata of the possessor's directory in the paper warehouse
	and are used to recommend proper new papers to the possessors.

	Args:
		possessor_category_dict (Dict[str, List[str]]): the research categories to be set.
			It is a dictionary with:

			- key: possessor
			- value: the list of research categories. For details about research categories,
			refer to the class `ArxivCategory`.
	"""
	for member, categories in possessor_category_dict.items():
		# The directory id is the possessor's directory expressed relative to `self.root`.
		member_dir = Path(self.paper_root) / member
		relative_dir = member_dir.relative_to(self.root)
		self._set_dir_metadata(dir_id=str(relative_dir), key=DIR_CATEGORY_NAME, val=categories)

labridge.func_modules.paper.store.paper_store.PaperDirectorySummaryStore.update(dir_description_dict)

Update the relevant research fields of each directory. Typically used to manually set each directory's relevant research fields.

PARAMETER DESCRIPTION
dir_description_dict

the descriptions of the paper directories - key: the directory path relative to root; - value: the relevant research fields of the directory.

TYPE: Dict[str, str]

Source code in labridge\func_modules\paper\store\paper_store.py
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
def update(self, dir_description_dict: Dict[str, str]):
	r"""
	Overwrite the relevant research fields of the given directories.
	Typically used to set each directory's relevant research fields manually.

	Args:
		dir_description_dict (Dict[str, str]): the descriptions of the paper directories
			- key: the directory path relative to root;
			- value: the relevant research fields of the directory.
	"""
	for relative_dir, research_fields in dir_description_dict.items():
		self._set_dir_metadata(dir_id=relative_dir, key=PAPER_LEVEL_KEYWORDS, val=research_fields)

labridge.func_modules.paper.store.paper_store.PaperStorage

Bases: object

Store the papers in vector index and summary index. The vector index stores the text chunks of the main text (and methods) and their embeddings. The summary index stores the summaries of the papers. Note that they can not share the storage context.

PARAMETER DESCRIPTION
docs

the Documents to be stored.

TYPE: List[Document] DEFAULT: None

extra_docs

extra Documents (like References), they are stored in the docstore of the index.

TYPE: List[Document] DEFAULT: None

vector_index

existing vector index.

TYPE: VectorStoreIndex DEFAULT: None

vector_persist_dir

the store directory of the vector index.

TYPE: Union[str, PathLike] DEFAULT: None

vector_transformations

the transformations used in the construction of the vector index.

TYPE: List[TransformComponent] DEFAULT: None

paper_summary_index

existing summary index.

TYPE: DocumentSummaryIndex DEFAULT: None

paper_summary_persist_dir

the store directory of the summary index.

TYPE: Union[str, PathLike] DEFAULT: None

paper_summary_query

the query used in summarizing the papers.

TYPE: str DEFAULT: PAPER_SUMMARIZE_QUERY

summary_transformations

the transformations used in the construction of the summary index.

TYPE: List[TransformComponent] DEFAULT: None

summary_synthesizer

the synthesizer used in summarizing the papers.

TYPE: PaperBatchSummarize DEFAULT: None

vector_storage_context

the storage context of the vector index.

TYPE: StorageContext DEFAULT: None

paper_summary_storage_context

the storage context of the summary index.

TYPE: StorageContext DEFAULT: None

service_context

the service context.

TYPE: ServiceContext DEFAULT: None

Source code in labridge\func_modules\paper\store\paper_store.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
class PaperStorage(object):
	r"""
	Store the papers in a vector index and a summary index.
	The vector index stores the text chunks of the main text (and methods) and their embeddings.
	The summary index stores the summaries of the papers.
	Note that the two indices can not share a storage context.

	Either existing indices (`vector_index` and `paper_summary_index`) or the documents
	(`docs` and `extra_docs`) must be provided. When both indices are given they are used
	as they are; otherwise both indices are built from the documents.

	Args:
		docs (List[Document]): the Documents to be stored.
		extra_docs (List[Document]): extra Documents (like References),
			they are stored in the docstore of the index.
		vector_index (VectorStoreIndex): existing vector index.
		vector_persist_dir (Union[str, os.PathLike]): the store directory of the vector index.
		vector_transformations (List[TransformComponent]): the transformations used in the construction of the vector index.
		paper_summary_index (DocumentSummaryIndex): existing summary index.
		paper_summary_persist_dir (Union[str, os.PathLike]): the store directory of the summary index.
		paper_summary_query (str): the query used in summarizing the papers.
		summary_transformations (List[TransformComponent]): the transformations used in the construction of the summary index.
		summary_synthesizer (PaperBatchSummarize): the synthesizer used in summarizing the papers.
		vector_storage_context (StorageContext): the storage context of the vector index.
		paper_summary_storage_context (StorageContext): the storage context of the summary index.
		service_context (ServiceContext): the service context.

	Raises:
		ValueError: If neither both indices nor both `docs` and `extra_docs` are provided,
			or if the two provided indices share a storage context.
	"""
	def __init__(
		self,
		docs: Optional[List[Document]] = None,
		extra_docs: Optional[List[Document]] = None,
		vector_index: Optional[VectorStoreIndex] = None,
		vector_persist_dir: Optional[Union[str, os.PathLike]] = None,
		vector_transformations: Optional[List[TransformComponent]] = None,
		paper_summary_index: Optional[DocumentSummaryIndex] = None,
		paper_summary_persist_dir: Optional[Union[str, os.PathLike]] = None,
		paper_summary_query: str = PAPER_SUMMARIZE_QUERY,
		summary_transformations: Optional[List[TransformComponent]] = None,
		summary_synthesizer: Optional[PaperBatchSummarize] = None,
		vector_storage_context: Optional[StorageContext] = None,
		paper_summary_storage_context: Optional[StorageContext] = None,
		service_context: Optional[ServiceContext] = None,
	):
		self.root = self._project_root()
		self.llm = llm_from_settings_or_context(Settings, service_context)
		self.embed_model = embed_model_from_settings_or_context(Settings, service_context)
		self.service_context = service_context
		self.vector_persist_dir = vector_persist_dir or self._default_vector_persist_dir()
		self.paper_summary_persist_dir = paper_summary_persist_dir or self._default_paper_summary_persist_dir()
		self.vector_transformations = vector_transformations or self._default_vector_transformations()
		self.summary_transformations = summary_transformations or self._default_summary_transformations()
		self.summary_synthesizer = summary_synthesizer
		self.paper_summary_query = paper_summary_query
		if summary_synthesizer is None:
			self.summary_synthesizer = PaperBatchSummarize(llm=self.llm, max_tokens=8000, overlap_chunk_num=1)

		has_both_indices = vector_index is not None and paper_summary_index is not None
		if not has_both_indices and (docs is None or extra_docs is None):
			raise ValueError("Please provide (docs, extra_docs) or existed (vector_index, summary_index).")

		if has_both_indices:
			# Explicit check instead of `assert`, so it is not stripped when Python runs with -O.
			contexts_differ = vector_index.storage_context != paper_summary_index.storage_context
			if not contexts_differ:
				raise ValueError("The vector index and the paper summary index can not share a storage context.")
			self.vector_index, self.paper_summary_index = vector_index, paper_summary_index
			self.paper_summary_index._response_synthesizer = self.summary_synthesizer
			self.vector_storage_context = vector_index.storage_context
			self.paper_summary_storage_context = paper_summary_index.storage_context
		else:
			self.vector_storage_context = vector_storage_context or StorageContext.from_defaults()
			self.paper_summary_storage_context = paper_summary_storage_context or StorageContext.from_defaults()
			self.build_index_from_docs(docs=docs, extra_docs=extra_docs)

	@staticmethod
	def _project_root() -> Path:
		r""" Return the directory five levels above this source file. """
		root = Path(__file__)
		for _ in range(5):
			root = root.parent
		return root

	def _default_vector_persist_dir(self) -> str:
		r""" Return `DEFAULT_PAPER_VECTOR_PERSIST_DIR` resolved against `self.root`. """
		return str(self.root / DEFAULT_PAPER_VECTOR_PERSIST_DIR)

	def _default_paper_summary_persist_dir(self) -> str:
		r""" Return `DEFAULT_PAPER_SUMMARY_PERSIST_DIR` resolved against `self.root`. """
		return str(self.root / DEFAULT_PAPER_SUMMARY_PERSIST_DIR)

	def _default_vector_transformations(self) -> List[TransformComponent]:
		r""" Return the default transformations of the vector index: a single sentence splitter. """
		return [SentenceSplitter(chunk_size=1024, chunk_overlap=256, include_metadata=True), ]

	def _default_summary_transformations(self) -> List[TransformComponent]:
		r""" Return the default transformations of the summary index: a single sentence splitter. """
		return [SentenceSplitter(chunk_size=1024, chunk_overlap=256, include_metadata=True), ]

	def build_vector_index_from_docs(self, docs: List[Document]) -> VectorStoreIndex:
		r"""
		Build a vector database from the paper docs.

		Args:
			docs (List[Document]): The paper Documents.

		Returns:
			VectorStoreIndex

		Raises:
			ValueError: If any doc is rejected by `_are_valid_docs`.
		"""
		if not self._are_valid_docs(docs):
			raise ValueError("Doc not in paper warehouse.")
		vector_index = VectorStoreIndex.from_documents(
			documents=docs,
			storage_context=self.vector_storage_context,
			show_progress=True,
			transformations=self.vector_transformations,
			service_context=self.service_context,
		)
		vector_index.set_index_id(PAPER_VECTOR_INDEX_ID)
		return vector_index

	def build_paper_summary_index_from_docs(self, docs: List[Document]) -> DocumentSummaryIndex:
		r"""
		Build a summary vector database from the paper docs.

		Args:
			docs (List[Document]): The paper Documents.

		Returns:
			DocumentSummaryIndex

		Raises:
			ValueError: If any doc is rejected by `_are_valid_docs`.
		"""
		if not self._are_valid_docs(docs):
			raise ValueError("Doc not in paper warehouse.")
		paper_summary_index = DocumentSummaryIndex.from_documents(
			documents=docs,
			storage_context=self.paper_summary_storage_context,
			show_progress=True,
			transformations=self.summary_transformations,
			summary_query=self.paper_summary_query,
			service_context=self.service_context,
			response_synthesizer=self.summary_synthesizer,
		)
		paper_summary_index.set_index_id(PAPER_SUMMARY_INDEX_ID)
		return paper_summary_index

	def build_index_from_docs(
		self,
		docs: List[Document],
		extra_docs: List[Document],
	):
		r"""
		Build both indices from the docs and persist them.

		Both indices are first built from the first paper doc only and persisted; the
		remaining paper docs and all extra docs are then added through `insert`, which
		selects the summarize query according to each doc's type.

		Args:
			docs (List[Document]): The paper Documents.
			extra_docs (List[Document]): The extra Documents, stored in the docstores only.

		Raises:
			ValueError: If any doc is rejected by `_are_valid_docs`.
		"""
		if not self._are_valid_docs(docs + extra_docs):
			raise ValueError("Doc not in paper warehouse.")

		self.vector_index = self.build_vector_index_from_docs(docs=docs[:1])
		self.paper_summary_index = self.build_paper_summary_index_from_docs(docs=docs[:1])
		self.persist()
		self.insert(paper_docs=docs[1:], extra_docs=extra_docs)

	@classmethod
	def from_storage(
		cls,
		vector_persist_dir: Optional[str] = None,
		paper_summary_persist_dir: Optional[str] = None,
		vector_transformations: Optional[List[TransformComponent]] = None,
		paper_summary_query: str = PAPER_SUMMARIZE_QUERY,
		summary_transformations: Optional[List[TransformComponent]] = None,
		summary_synthesizer: Optional[BaseSynthesizer] = None,
		service_context: Optional[ServiceContext] = None,
	):
		r"""
		Load from an existing storage.

		Args:
			vector_persist_dir (Optional[str]): the store directory of the vector index.
				Falls back to the default vector persist directory when empty or None.
			paper_summary_persist_dir (Optional[str]): the store directory of the summary index.
				Falls back to the default summary persist directory when empty or None.
			vector_transformations (List[TransformComponent]): forwarded to the constructor.
			paper_summary_query (str): forwarded to the constructor.
			summary_transformations (List[TransformComponent]): forwarded to the constructor.
			summary_synthesizer (BaseSynthesizer): forwarded to the constructor.
			service_context (ServiceContext): used for loading the indices and forwarded to the constructor.

		Returns:
			PaperStorage: a storage wrapping the two loaded indices.
		"""
		root = cls._project_root()

		vector_persist_dir = vector_persist_dir or str(root / DEFAULT_PAPER_VECTOR_PERSIST_DIR)
		paper_summary_persist_dir = paper_summary_persist_dir or str(root / DEFAULT_PAPER_SUMMARY_PERSIST_DIR)
		vector_storage_context = StorageContext.from_defaults(persist_dir=vector_persist_dir)
		paper_summary_storage_context = StorageContext.from_defaults(persist_dir=paper_summary_persist_dir)

		vector_index = load_index_from_storage(
			storage_context=vector_storage_context,
			index_id=PAPER_VECTOR_INDEX_ID,
			service_context=service_context,
		)
		paper_summary_index = load_index_from_storage(
			storage_context=paper_summary_storage_context,
			index_id=PAPER_SUMMARY_INDEX_ID,
			service_context=service_context,
		)

		return cls(
			vector_index=vector_index,
			paper_summary_index=paper_summary_index,
			vector_transformations=vector_transformations,
			vector_persist_dir=vector_persist_dir,
			paper_summary_persist_dir=paper_summary_persist_dir,
			paper_summary_query=paper_summary_query,
			summary_transformations=summary_transformations,
			summary_synthesizer=summary_synthesizer,
			service_context=service_context,
		)

	def _is_valid_doc(self, doc: Document) -> bool:
		r"""
		Judge whether the paper doc is from the paper warehouse.

		The part of the doc id in front of the first `_<content type>` marker is taken as
		a path relative to `self.root`; the doc is valid if that path exists.
		A doc without the `CONTENT_TYPE_NAME` metadata is invalid.
		"""
		if CONTENT_TYPE_NAME not in doc.metadata:
			return False
		doc_type = doc.metadata[CONTENT_TYPE_NAME]
		rel_path = doc.doc_id.split(f'_{doc_type}')[0]
		return (self.root / rel_path).exists()

	def _are_valid_docs(self, docs: List[Document]) -> bool:
		r""" Return True if every doc passes `_is_valid_doc`; print the first invalid doc otherwise. """
		for doc in docs:
			if not self._is_valid_doc(doc):
				print(f"Invalid doc. Doc {doc.doc_id} is not in paper warehouse.")
				return False
		return True

	def insert(self, paper_docs: List[Document], extra_docs: List[Document]):
		r"""
		Add new papers to index.
		Assert all new papers are already categorized (that is: they are from the organized paper warehouse.)

		You are encouraged to build a storage with one paper first, then use the `insert` method to add
		other papers, because the summarize query can be chosen depending on each doc's type.

		A paper doc that is already recorded in an index's docstore is not inserted into that index again.
		The storage is persisted after all docs have been added.

		Args:
			paper_docs (List[Document]): these docs will be summarized; chunked and vectorized.
			extra_docs (List[Document]): these docs are stored in docstore.

		Raises:
			ValueError: If any doc is rejected by `_are_valid_docs`, or if the content type of a
				paper doc has no entry in `SummarizeQueries`. In both cases nothing is inserted.
		"""
		if not self._are_valid_docs(paper_docs + extra_docs):
			raise ValueError("Doc not in paper warehouse.")

		# Resolve every summarize query before touching the indices, so that an unsupported
		# doc type can not leave earlier docs inserted but unpersisted.
		sum_queries = []
		for doc in paper_docs:
			doc_type = doc.metadata[CONTENT_TYPE_NAME]
			if doc_type not in SummarizeQueries.keys():
				raise ValueError(f'Invalid paper doc type: {doc_type}. Acceptable: {list(SummarizeQueries.keys())}.')
			sum_queries.append(SummarizeQueries[doc_type])

		for doc, sum_query in zip(paper_docs, sum_queries):
			self.paper_summary_index._response_synthesizer._summary_query = sum_query

			# `get_all_ref_doc_info` is typed as Optional; treat None as "nothing stored yet".
			summarized = self.paper_summary_index.docstore.get_all_ref_doc_info() or {}
			if doc.doc_id not in summarized:
				self.paper_summary_index.insert(document=doc)
			vectorized = self.vector_index.docstore.get_all_ref_doc_info() or {}
			if doc.doc_id not in vectorized:
				self.vector_index.insert(document=doc)

		self.vector_index.docstore.add_documents(extra_docs)
		self.paper_summary_index.docstore.add_documents(extra_docs)
		self.persist()

	def persist(self,
				vector_persist_dir: Optional[Union[str, os.PathLike]] = None,
				paper_summary_persist_dir: Optional[Union[str, os.PathLike]] = None):
		r"""
		Persist to the disk.

		Args:
			vector_persist_dir (Union[str, os.PathLike]): target directory of the vector storage context.
				Defaults to `self.vector_persist_dir`.
			paper_summary_persist_dir (Union[str, os.PathLike]): target directory of the summary storage context.
				Defaults to `self.paper_summary_persist_dir`.
		"""
		if vector_persist_dir is None:
			vector_persist_dir = self.vector_persist_dir
		if paper_summary_persist_dir is None:
			paper_summary_persist_dir = self.paper_summary_persist_dir
		self.vector_storage_context.persist(vector_persist_dir)
		self.paper_summary_storage_context.persist(paper_summary_persist_dir)

labridge.func_modules.paper.store.paper_store.PaperStorage.build_paper_summary_index_from_docs(docs)

Build a summary vector database from the paper docs.

PARAMETER DESCRIPTION
docs

The paper Documents.

TYPE: List[Document]

RETURNS DESCRIPTION
DocumentSummaryIndex

DocumentSummaryIndex

Source code in labridge\func_modules\paper\store\paper_store.py
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
def build_paper_summary_index_from_docs(self, docs: List[Document]) -> DocumentSummaryIndex:
	r"""
	Create the paper summary index from the given paper docs.

	The docs are checked with `_are_valid_docs` first. The index is built in
	`self.paper_summary_storage_context` and tagged with `PAPER_SUMMARY_INDEX_ID`.

	Args:
		docs (List[Document]): The paper Documents to summarize.

	Returns:
		DocumentSummaryIndex: The newly built summary index.

	Raises:
		ValueError: If `_are_valid_docs` rejects the docs.
	"""
	docs_are_valid = self._are_valid_docs(docs)
	if not docs_are_valid:
		raise ValueError("Doc not in paper warehouse.")

	build_options = dict(
		documents=docs,
		storage_context=self.paper_summary_storage_context,
		show_progress=True,
		transformations=self.summary_transformations,
		summary_query=self.paper_summary_query,
		service_context=self.service_context,
		response_synthesizer=self.summary_synthesizer,
	)
	summary_index = DocumentSummaryIndex.from_documents(**build_options)
	summary_index.set_index_id(PAPER_SUMMARY_INDEX_ID)
	return summary_index

labridge.func_modules.paper.store.paper_store.PaperStorage.build_vector_index_from_docs(docs)

Build a vector database from the paper docs.

PARAMETER DESCRIPTION
docs

The paper Documents.

TYPE: List[Document]

RETURNS DESCRIPTION
VectorStoreIndex

VectorStoreIndex

Source code in labridge\func_modules\paper\store\paper_store.py
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
def build_vector_index_from_docs(self, docs: List[Document]) -> VectorStoreIndex:
	r"""
	Create the vector index from the given paper docs.

	The docs are checked with `_are_valid_docs` first. The index is built in
	`self.vector_storage_context` and tagged with `PAPER_VECTOR_INDEX_ID`.

	Args:
		docs (List[Document]): The paper Documents to chunk and embed.

	Returns:
		VectorStoreIndex: The newly built vector index.

	Raises:
		ValueError: If `_are_valid_docs` rejects the docs.
	"""
	docs_are_valid = self._are_valid_docs(docs)
	if not docs_are_valid:
		raise ValueError("Doc not in paper warehouse.")

	build_options = dict(
		documents=docs,
		storage_context=self.vector_storage_context,
		show_progress=True,
		transformations=self.vector_transformations,
		service_context=self.service_context,
	)
	built_index = VectorStoreIndex.from_documents(**build_options)
	built_index.set_index_id(PAPER_VECTOR_INDEX_ID)
	return built_index

labridge.func_modules.paper.store.paper_store.PaperStorage.from_storage(vector_persist_dir, paper_summary_persist_dir, vector_transformations=None, paper_summary_query=PAPER_SUMMARIZE_QUERY, summary_transformations=None, summary_synthesizer=None, service_context=None) classmethod

Load from an existing storage.

Source code in labridge\func_modules\paper\store\paper_store.py
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
@classmethod
def from_storage(
	cls,
	vector_persist_dir: Optional[str] = None,
	paper_summary_persist_dir: Optional[str] = None,
	vector_transformations: Optional[List[TransformComponent]] = None,
	paper_summary_query: str = PAPER_SUMMARIZE_QUERY,
	summary_transformations: Optional[List[TransformComponent]] = None,
	summary_synthesizer: Optional[BaseSynthesizer] = None,
	service_context: Optional[ServiceContext] = None,
):
	r"""
	Load from an existing storage.

	Args:
		vector_persist_dir (Optional[str]): the store directory of the vector index.
			Falls back to the default vector persist directory when empty or None.
		paper_summary_persist_dir (Optional[str]): the store directory of the summary index.
			Falls back to the default summary persist directory when empty or None.
		vector_transformations (List[TransformComponent]): forwarded to the constructor.
		paper_summary_query (str): forwarded to the constructor.
		summary_transformations (List[TransformComponent]): forwarded to the constructor.
		summary_synthesizer (BaseSynthesizer): forwarded to the constructor.
		service_context (ServiceContext): used for loading the indices and forwarded to the constructor.

	Returns:
		An instance of `cls` wrapping the two loaded indices.
	"""
	# The default persist directories are resolved against the directory five levels above this file.
	root = Path(__file__)
	for _ in range(5):
		root = root.parent

	vector_persist_dir = vector_persist_dir or str(root / DEFAULT_PAPER_VECTOR_PERSIST_DIR)
	paper_summary_persist_dir = paper_summary_persist_dir or str(root / DEFAULT_PAPER_SUMMARY_PERSIST_DIR)
	vector_storage_context = StorageContext.from_defaults(persist_dir=vector_persist_dir)
	paper_summary_storage_context = StorageContext.from_defaults(persist_dir=paper_summary_persist_dir)

	vector_index = load_index_from_storage(
		storage_context=vector_storage_context,
		index_id=PAPER_VECTOR_INDEX_ID,
		service_context=service_context,
	)
	paper_summary_index = load_index_from_storage(
		storage_context=paper_summary_storage_context,
		index_id=PAPER_SUMMARY_INDEX_ID,
		service_context=service_context,
	)

	return cls(
		vector_index=vector_index,
		paper_summary_index=paper_summary_index,
		vector_transformations=vector_transformations,
		vector_persist_dir=vector_persist_dir,
		paper_summary_persist_dir=paper_summary_persist_dir,
		paper_summary_query=paper_summary_query,
		summary_transformations=summary_transformations,
		summary_synthesizer=summary_synthesizer,
		service_context=service_context,
	)

labridge.func_modules.paper.store.paper_store.PaperStorage.insert(paper_docs, extra_docs)

Add new papers to index. Assert all new papers are already categorized (that is: they are from the organized paper warehouse.)

You are encouraged to build a storage with one paper first, then use the insert method to add the other papers, because the summarize query can then be chosen according to each doc's type.

PARAMETER DESCRIPTION
paper_docs

these docs will be summarized; chunked and vectorized.

TYPE: List[Document]

extra_docs

these docs are stored in docstore.

TYPE: List[Document]

Source code in labridge\func_modules\paper\store\paper_store.py
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
def insert(self, paper_docs: List[Document], extra_docs: List[Document]):
	r"""
	Add new papers to index.
	Assert all new papers are already categorized (that is: they are from the organized paper warehouse.)

	You are encouraged to build a storage with one paper first, then use the `insert` method to add
	other papers, because the summarize query can be chosen depending on each doc's type.

	A paper doc that is already recorded in an index's docstore is not inserted into that index again.
	The storage is persisted after all docs have been added.

	Args:
		paper_docs (List[Document]): these docs will be summarized; chunked and vectorized.
		extra_docs (List[Document]): these docs are stored in docstore.

	Raises:
		ValueError: If any doc is rejected by `_are_valid_docs`, or if the content type of a
			paper doc has no entry in `SummarizeQueries`. In both cases nothing is inserted.
	"""
	if not self._are_valid_docs(paper_docs + extra_docs):
		raise ValueError("Doc not in paper warehouse.")

	# Resolve every summarize query before touching the indices, so that an unsupported
	# doc type can not leave earlier docs inserted but unpersisted.
	sum_queries = []
	for doc in paper_docs:
		doc_type = doc.metadata[CONTENT_TYPE_NAME]
		if doc_type not in SummarizeQueries.keys():
			raise ValueError(f'Invalid paper doc type: {doc_type}. Acceptable: {list(SummarizeQueries.keys())}.')
		sum_queries.append(SummarizeQueries[doc_type])

	for doc, sum_query in zip(paper_docs, sum_queries):
		self.paper_summary_index._response_synthesizer._summary_query = sum_query

		# `get_all_ref_doc_info` is typed as Optional; treat None as "nothing stored yet".
		summarized = self.paper_summary_index.docstore.get_all_ref_doc_info() or {}
		if doc.doc_id not in summarized:
			self.paper_summary_index.insert(document=doc)
		vectorized = self.vector_index.docstore.get_all_ref_doc_info() or {}
		if doc.doc_id not in vectorized:
			self.vector_index.insert(document=doc)

	self.vector_index.docstore.add_documents(extra_docs)
	self.paper_summary_index.docstore.add_documents(extra_docs)
	self.persist()

labridge.func_modules.paper.store.paper_store.PaperStorage.persist(vector_persist_dir=None, paper_summary_persist_dir=None)

Persist to the disk.

Source code in labridge\func_modules\paper\store\paper_store.py
314
315
316
317
318
319
320
321
322
323
def persist(self,
			vector_persist_dir: Union[str, os.PathLike] = None,
			paper_summary_persist_dir: Union[str, os.PathLike] = None):
	r"""
	Write both storage contexts to disk.

	Args:
		vector_persist_dir (Union[str, os.PathLike]): target directory of the vector storage context.
			`self.vector_persist_dir` is used when it is None.
		paper_summary_persist_dir (Union[str, os.PathLike]): target directory of the summary storage context.
			`self.paper_summary_persist_dir` is used when it is None.
	"""
	vector_target = self.vector_persist_dir if vector_persist_dir is None else vector_persist_dir
	summary_target = (
		self.paper_summary_persist_dir
		if paper_summary_persist_dir is None
		else paper_summary_persist_dir
	)
	# The vector storage context is persisted first, then the summary one.
	self.vector_storage_context.persist(vector_target)
	self.paper_summary_storage_context.persist(summary_target)