Paper reader

`labridge.func_modules.paper.parse.paper_reader` ¶

`labridge.func_modules.paper.parse.paper_reader.PaperReader` ¶

Read a PDF paper, and extract valid meta_data from it.

PARAMETER	DESCRIPTION
`llm`	the used llm, if not provided, use the llm from `service_context`. Defaults to None. TYPE: `LLM` DEFAULT: `None`
`source_keyword_threshold`	used in PaperSourceAnalyzer. refer to PaperSourceAnalyzer for details. Defaults to 10 TYPE: `int` DEFAULT: `10`
`use_llm_for_source`	whether to use LLM in the source analyzer. Defaults to True. TYPE: `bool` DEFAULT: `True`
`extract_metadata`	whether to use LLM to extract metadata for papers. Defaults to True. TYPE: `bool` DEFAULT: `True`
`necessary_metadata`	Paper level metadata. The necessary metadata that must be extracted. It is a dictionary with k-v pairs like: {metadata_name: description}. The description is used to instruct the llm to extract the corresponding metadata. For example: key: "Title" value: "The title often appears as a single concise sentence at the head of a paper." TYPE: `Dict[str, str]` DEFAULT: `None`
`optional_metadata`	Paper level metadata. The optional metadata that is not forced to extract from the paper. It is a dictionary with k-v pairs like: {metadata_name: description}. TYPE: `Dict[str, str]` DEFAULT: `None`
`extract_retry_times`	max retry times if not all necessary metadata is extracted. TYPE: `int` DEFAULT: `2`
`service_context`	the service context. TYPE: `ServiceContext` DEFAULT: `None`
`recursive`	Whether to recursively search in subdirectories. TYPE: `bool` DEFAULT: `False`

False by default.
    exclude (List): glob of python file paths to exclude (Optional)

exclude_hidden (bool): Whether to exclude hidden files (dotfiles). required_exts (Optional[List[str]]): List of required extensions. Default is None. num_files_limit (Optional[int]): Maximum number of files to read. Default is None. filename_as_id (bool): whether to use the filename as the document id. True by default. If set to True, the doc node will be named as {file_path}_{content_type}. The file_path is relative to root directory.

Source code in labridge\func_modules\paper\parse\paper_reader.py

class PaperReader:
	r"""
	Read a PDF paper, and extract valid meta_data from it.

	Args:
		llm (LLM: the used llm, if not provided, use the llm from `service_context`.
			Defaults to None.
		source_keyword_threshold (int): used in PaperSourceAnalyzer. refer to PaperSourceAnalyzer for details.
			Defaults to 10
		use_llm_for_source (bool): whether to use LLM in the source analyzer. Defaults to True.
		extract_metadata (bool): whether to use LLM to extract metadata for papers. Defaults to True.
		necessary_metadata (Dict[str, str]): Paper level metadata.
			The necessary metadata that must be extracted.
		 	It is a dictionary with k-v pairs like: {metadata_name: description}. The description
		 	is used to instruct the llm to extract the corresponding metadata.
		 	For example:

		 	- key: "Title"
		 	- value: "The title often appears as a single concise sentence at the head of a paper."
		optional_metadata (Dict[str, str]): Paper level metadata.
			The optional metadata that is not forced to extract from the paper.
			It is a dictionary with k-v pairs like: {metadata_name: description}.
		extract_retry_times: max retry times if not all necessary metadata is extracted.
		service_context (ServiceContext): the service context.
		recursive (bool): Whether to recursively search in subdirectories.
            False by default.
		exclude (List): glob of python file paths to exclude (Optional)
        exclude_hidden (bool): Whether to exclude hidden files (dotfiles).
        required_exts (Optional[List[str]]): List of required extensions.
            Default is None.
        num_files_limit (Optional[int]): Maximum number of files to read.
            Default is None.
		filename_as_id (bool): whether to use the filename as the document id. True by default.
			If set to True, the doc node will be named as `{file_path}_{content_type}`.
			The file_path is relative to root directory.
	"""
	def __init__(
		self,
		llm: LLM = None,
		source_keyword_threshold: int = 10,
		use_llm_for_source: bool = True,
		extract_metadata: bool = True,
		necessary_metadata: Dict[str, str] = None,
		optional_metadata: Dict[str, str] = None,
		extract_retry_times: int = 2,
		filename_as_id: bool = True,
		service_context: ServiceContext = None,
		recursive: bool = False,
		exclude: Optional[List] = None,
		exclude_hidden: bool = True,
		required_exts: Optional[List[str]] = None,
		num_files_limit: Optional[int] = None,
		fs: Optional[fsspec.AbstractFileSystem] = None,
	):
		self.metadata_extractor = None
		self.extract_metadata = extract_metadata
		if extract_metadata:
			self.metadata_extractor = PaperMetadataExtractor(
				llm=llm,
				necessary_metadata=necessary_metadata,
				optional_metadata=optional_metadata,
				max_retry_times=extract_retry_times,
				service_context=service_context,
			)
		if llm is None:
			self.llm = llm_from_settings_or_context(Settings, service_context)
		else:
			self.llm = llm

		self.source_analyzer = PaperSourceAnalyzer(llm=self.llm, keyword_count_threshold=source_keyword_threshold)
		self.use_llm_for_source = use_llm_for_source
		self.filename_as_id = filename_as_id
		self.recursive = recursive
		self.exclude = exclude
		self.exclude_hidden = exclude_hidden
		self.required_exts = required_exts
		self.num_files_limit = num_files_limit
		self.fs = fs or LocalFileSystem()
		root = Path(__file__)
		for i in range(5):
			root = root.parent
		self.root = root

	def get_paper_possessor(self, paper_path: Union[Path, str]) -> str:
		r"""
		Get the possessor of this paper.
		Assume the possessor is the first level directory under the paper warehouse.

		Args:
			paper_path (Union[Path, str]): The file path of paper.

		Returns:
			str: The paper possessor.
		"""
		if isinstance(paper_path, str):
			paper_path = Path(paper_path)
		try:
			paper_warehouse = self.root / SHARED_PAPER_WAREHOUSE_DIR
			rel = paper_path.relative_to(paper_warehouse)
			possessor = rel.parts[0]
			return possessor
		except ValueError:
			raise ValueError("The path of the paper is not valid, a valid path should be under the PaperWarehouse directory.")

	def read_single_paper(
		self,
		file_path: Union[Path, str],
		show_progress: bool = True,
		extra_metadata: dict = None,
	) -> Optional[Tuple[List[Document], List[Document]]]:
		r"""
		Read a single pdf paper.

		Args:
			file_path (Union[Path, str]): the path of pdf paper.
			show_progress (bool): show parsing progress.
			extra_metadata (dict): Existing metadata obtained by approaches such as arXiv API.

		Returns:
			Tuple[List[Document], List[Document]]:
				The ingested content docs and extra docs.

				- chunk_docs: the docs for retrieving, include information such as main text, methods.
				Might be None if nothing is parsed (auto_parse_paper fails.)
				- extra_docs: docs that involve supplementary information such as references.
				Might be None.
		"""
		if isinstance(file_path, str):
			file_path = Path(file_path)
		if str(file_path)[-4:] != '.pdf':
			raise ValueError("Expect a PDF file.")
		if show_progress:
			print_text(f">>> Loading {file_path}", color="blue", end="\n")
		parsed_docs = auto_parse_paper(
			file_path=file_path,
			source_analyzer=self.source_analyzer,
			use_llm_for_source=self.use_llm_for_source
		)

		chunk_docs, extra_docs, metadata_docs = [], [], []
		for doc in parsed_docs:
			if doc.metadata[CONTENT_TYPE_NAME] in MetadataContents:
				metadata_docs.append(doc)
			elif doc.metadata[CONTENT_TYPE_NAME] in ChunkContents:
				chunk_docs.append(doc)
			else:
				extra_docs.append(doc)

		# metadata
		paper_metadata = dict()

		if self.extract_metadata:
			paper_metadata = self.metadata_extractor.extract_paper_metadata(
				pdf_path=file_path,
				extra_metadata=extra_metadata,
			)
			if paper_metadata is None:
				print(f"Loading DOI failed: {file_path}")
				return None

			for meta_doc in metadata_docs:
				metadata_name = meta_doc.metadata[CONTENT_TYPE_NAME]
				if metadata_name not in paper_metadata.keys():
					paper_metadata[metadata_name] = meta_doc.text

		possessor = self.get_paper_possessor(file_path)
		paper_metadata[PAPER_POSSESSOR] = possessor
		paper_metadata[PAPER_REL_FILE_PATH] = str(file_path.relative_to(self.root))

		for idx, doc in enumerate(parsed_docs):
			doc.metadata.update(paper_metadata)
			if self.filename_as_id:
				rel_path = str(file_path.relative_to(self.root))
				doc.id_ = f"{rel_path!s}_{doc.metadata[CONTENT_TYPE_NAME]}"
		return chunk_docs, extra_docs

	def read_papers(
		self,
		input_dir: Optional[str] = None,
		input_files: Optional[List] = None,
		show_progress: bool = True,
	) -> Tuple[List[Document], List[Document]]:
		r"""
		Read papers.

		Args:
			input_dir (Optional[str]): the paper directory.
			input_files (Optional[List]): the paths of papers. If it is specified, the `input_dir` is ignored.
			show_progress (bool): show parsing progress.

		Returns:
			Tuple[List[Document], List[Document]]:
				the content docs and the extra docs.

				- contents: for retrieving, each sequence in the list contains the content docs of a paper.
				- extra_info: extra info, each sequence in the list contains the extra docs of a paper.
		"""
		_Path = Path if is_default_fs(self.fs) else PurePosixPath
		paper_files = None
		if input_files:
			paper_files = []
			for path in input_files:
				if not self.fs.isfile(path):
					raise ValueError(f"File {path} does not exist.")
				input_file = _Path(path)
				paper_files.append(input_file)
		elif input_dir:
			if not self.fs.isdir(input_dir):
				raise ValueError(f"Directory {input_dir} does not exist.")
			input_dir = _Path(input_dir)
			paper_files = self._add_files(input_dir)

		contents, extra_info = [], []
		if paper_files is not None:
			for idx, paper in enumerate(paper_files):
				if str(paper)[-4:] != '.pdf':
					continue
				content_docs, extra_docs = self.read_single_paper(file_path=paper, show_progress=show_progress)
				contents += content_docs
				extra_info += extra_docs
		return contents, extra_info

	def is_hidden(self, path: Path) -> bool:
		return any(part.startswith(".") and part not in [".", ".."] for part in path.parts)

	def _add_files(self, input_dir: Path) -> List[Path]:
		"""Add files."""
		all_files = set()
		rejected_files = set()
		rejected_dirs = set()
		# Default to POSIX paths for non-default file systems (e.g. S3)
		_Path = Path if is_default_fs(self.fs) else PurePosixPath

		if self.exclude is not None:
			for excluded_pattern in self.exclude:
				if self.recursive:
					# Recursive glob
					excluded_glob = _Path(input_dir) / _Path("**") / excluded_pattern
				else:
					# Non-recursive glob
					excluded_glob = _Path(input_dir) / excluded_pattern
				for file in self.fs.glob(str(excluded_glob)):
					if self.fs.isdir(file):
						rejected_dirs.add(_Path(file))
					else:
						rejected_files.add(_Path(file))

		file_refs: List[str] = []
		if self.recursive:
			file_refs = self.fs.glob(str(input_dir) + "/**/*")
		else:
			file_refs = self.fs.glob(str(input_dir) + "/*")

		for ref in file_refs:
			# Manually check if file is hidden or directory instead of
			# in glob for backwards compatibility.
			ref = _Path(ref)
			is_dir = self.fs.isdir(ref)
			skip_because_hidden = self.exclude_hidden and self.is_hidden(ref)
			skip_because_bad_ext = (self.required_exts is not None and ref.suffix not in self.required_exts)
			skip_because_excluded = ref in rejected_files
			if not skip_because_excluded:
				if is_dir:
					ref_parent_dir = ref
				else:
					ref_parent_dir = self.fs._parent(ref)
				for rejected_dir in rejected_dirs:
					if str(ref_parent_dir).startswith(str(rejected_dir)):
						skip_because_excluded = True
						logger.debug("Skipping %s because it in parent dir %s which is in %s", ref, ref_parent_dir,
							rejected_dir, )
						break

			if (is_dir or skip_because_hidden or skip_because_bad_ext or skip_because_excluded):
				continue
			else:
				all_files.add(ref)

		new_input_files = sorted(all_files)

		if len(new_input_files) == 0:
			raise ValueError(f"No files found in {input_dir}.")

		if self.num_files_limit is not None and self.num_files_limit > 0:
			new_input_files = new_input_files[0: self.num_files_limit]

		# print total number of files added
		logger.debug(f"> [PaperReader] Total files added: {len(new_input_files)}")

		return new_input_files

`labridge.func_modules.paper.parse.paper_reader.PaperReader.get_paper_possessor(paper_path)` ¶

Get the possessor of this paper. Assume the possessor is the first level directory under the paper warehouse.

PARAMETER	DESCRIPTION
`paper_path`	The file path of paper. TYPE: `Union[Path, str]`

RETURNS	DESCRIPTION
`str`	The paper possessor. TYPE: `str`

Source code in labridge\func_modules\paper\parse\paper_reader.py

def get_paper_possessor(self, paper_path: Union[Path, str]) -> str:
	r"""
	Get the possessor of this paper.
	Assume the possessor is the first level directory under the paper warehouse.

	Args:
		paper_path (Union[Path, str]): The file path of paper.

	Returns:
		str: The paper possessor.
	"""
	if isinstance(paper_path, str):
		paper_path = Path(paper_path)
	try:
		paper_warehouse = self.root / SHARED_PAPER_WAREHOUSE_DIR
		rel = paper_path.relative_to(paper_warehouse)
		possessor = rel.parts[0]
		return possessor
	except ValueError:
		raise ValueError("The path of the paper is not valid, a valid path should be under the PaperWarehouse directory.")

`labridge.func_modules.paper.parse.paper_reader.PaperReader.read_papers(input_dir=None, input_files=None, show_progress=True)` ¶

Read papers.

PARAMETER	DESCRIPTION
`input_dir`	the paper directory. TYPE: `Optional[str]` DEFAULT: `None`
`input_files`	the paths of papers. If it is specified, the `input_dir` is ignored. TYPE: `Optional[List]` DEFAULT: `None`
`show_progress`	show parsing progress. TYPE: `bool` DEFAULT: `True`

RETURNS	DESCRIPTION
`Tuple[List[Document], List[Document]]`	Tuple[List[Document], List[Document]]: the content docs and the extra docs. contents: for retrieving, each sequence in the list contains the content docs of a paper. extra_info: extra info, each sequence in the list contains the extra docs of a paper.

Source code in labridge\func_modules\paper\parse\paper_reader.py

def read_papers(
	self,
	input_dir: Optional[str] = None,
	input_files: Optional[List] = None,
	show_progress: bool = True,
) -> Tuple[List[Document], List[Document]]:
	r"""
	Read papers.

	Args:
		input_dir (Optional[str]): the paper directory.
		input_files (Optional[List]): the paths of papers. If it is specified, the `input_dir` is ignored.
		show_progress (bool): show parsing progress.

	Returns:
		Tuple[List[Document], List[Document]]:
			the content docs and the extra docs.

			- contents: for retrieving, each sequence in the list contains the content docs of a paper.
			- extra_info: extra info, each sequence in the list contains the extra docs of a paper.
	"""
	_Path = Path if is_default_fs(self.fs) else PurePosixPath
	paper_files = None
	if input_files:
		paper_files = []
		for path in input_files:
			if not self.fs.isfile(path):
				raise ValueError(f"File {path} does not exist.")
			input_file = _Path(path)
			paper_files.append(input_file)
	elif input_dir:
		if not self.fs.isdir(input_dir):
			raise ValueError(f"Directory {input_dir} does not exist.")
		input_dir = _Path(input_dir)
		paper_files = self._add_files(input_dir)

	contents, extra_info = [], []
	if paper_files is not None:
		for idx, paper in enumerate(paper_files):
			if str(paper)[-4:] != '.pdf':
				continue
			content_docs, extra_docs = self.read_single_paper(file_path=paper, show_progress=show_progress)
			contents += content_docs
			extra_info += extra_docs
	return contents, extra_info

`labridge.func_modules.paper.parse.paper_reader.PaperReader.read_single_paper(file_path, show_progress=True, extra_metadata=None)` ¶

Read a single pdf paper.

PARAMETER	DESCRIPTION
`file_path`	the path of pdf paper. TYPE: `Union[Path, str]`
`show_progress`	show parsing progress. TYPE: `bool` DEFAULT: `True`
`extra_metadata`	Existing metadata obtained by approaches such as arXiv API. TYPE: `dict` DEFAULT: `None`

RETURNS	DESCRIPTION
`Optional[Tuple[List[Document], List[Document]]]`	Tuple[List[Document], List[Document]]: The ingested content docs and extra docs. chunk_docs: the docs for retrieving, include information such as main text, methods. Might be None if nothing is parsed (auto_parse_paper fails.) extra_docs: docs that involve supplementary information such as references. Might be None.

Source code in labridge\func_modules\paper\parse\paper_reader.py

def read_single_paper(
	self,
	file_path: Union[Path, str],
	show_progress: bool = True,
	extra_metadata: dict = None,
) -> Optional[Tuple[List[Document], List[Document]]]:
	r"""
	Read a single pdf paper.

	Args:
		file_path (Union[Path, str]): the path of pdf paper.
		show_progress (bool): show parsing progress.
		extra_metadata (dict): Existing metadata obtained by approaches such as arXiv API.

	Returns:
		Tuple[List[Document], List[Document]]:
			The ingested content docs and extra docs.

			- chunk_docs: the docs for retrieving, include information such as main text, methods.
			Might be None if nothing is parsed (auto_parse_paper fails.)
			- extra_docs: docs that involve supplementary information such as references.
			Might be None.
	"""
	if isinstance(file_path, str):
		file_path = Path(file_path)
	if str(file_path)[-4:] != '.pdf':
		raise ValueError("Expect a PDF file.")
	if show_progress:
		print_text(f">>> Loading {file_path}", color="blue", end="\n")
	parsed_docs = auto_parse_paper(
		file_path=file_path,
		source_analyzer=self.source_analyzer,
		use_llm_for_source=self.use_llm_for_source
	)

	chunk_docs, extra_docs, metadata_docs = [], [], []
	for doc in parsed_docs:
		if doc.metadata[CONTENT_TYPE_NAME] in MetadataContents:
			metadata_docs.append(doc)
		elif doc.metadata[CONTENT_TYPE_NAME] in ChunkContents:
			chunk_docs.append(doc)
		else:
			extra_docs.append(doc)

	# metadata
	paper_metadata = dict()

	if self.extract_metadata:
		paper_metadata = self.metadata_extractor.extract_paper_metadata(
			pdf_path=file_path,
			extra_metadata=extra_metadata,
		)
		if paper_metadata is None:
			print(f"Loading DOI failed: {file_path}")
			return None

		for meta_doc in metadata_docs:
			metadata_name = meta_doc.metadata[CONTENT_TYPE_NAME]
			if metadata_name not in paper_metadata.keys():
				paper_metadata[metadata_name] = meta_doc.text

	possessor = self.get_paper_possessor(file_path)
	paper_metadata[PAPER_POSSESSOR] = possessor
	paper_metadata[PAPER_REL_FILE_PATH] = str(file_path.relative_to(self.root))

	for idx, doc in enumerate(parsed_docs):
		doc.metadata.update(paper_metadata)
		if self.filename_as_id:
			rel_path = str(file_path.relative_to(self.root))
			doc.id_ = f"{rel_path!s}_{doc.metadata[CONTENT_TYPE_NAME]}"
	return chunk_docs, extra_docs

Paper reader

labridge.func_modules.paper.parse.paper_reader ¶

labridge.func_modules.paper.parse.paper_reader.PaperReader ¶

labridge.func_modules.paper.parse.paper_reader.PaperReader.get_paper_possessor(paper_path) ¶

labridge.func_modules.paper.parse.paper_reader.PaperReader.read_papers(input_dir=None, input_files=None, show_progress=True) ¶

labridge.func_modules.paper.parse.paper_reader.PaperReader.read_single_paper(file_path, show_progress=True, extra_metadata=None) ¶

`labridge.func_modules.paper.parse.paper_reader` ¶

`labridge.func_modules.paper.parse.paper_reader.PaperReader` ¶

`labridge.func_modules.paper.parse.paper_reader.PaperReader.get_paper_possessor(paper_path)` ¶

`labridge.func_modules.paper.parse.paper_reader.PaperReader.read_papers(input_dir=None, input_files=None, show_progress=True)` ¶

`labridge.func_modules.paper.parse.paper_reader.PaperReader.read_single_paper(file_path, show_progress=True, extra_metadata=None)` ¶