跳转至

Base

labridge.func_modules.paper.parse.parsers.base

labridge.func_modules.paper.parse.parsers.base.BasePaperParser

This is the base paper parser. The Parser separates a paper into subcomponents according to several separators.

PARAMETER DESCRIPTION
separators

Each tuple includes the separators that separate two components.

TYPE: List[Tuple[str]]

content_names

Key: the number of parsed components; Value: the name of each component, in order.

TYPE: Dict[int, Tuple[str]]

separator_tolerance

The tolerance of mismatch chars.

TYPE: int DEFAULT: 3

Source code in labridge\func_modules\paper\parse\parsers\base.py
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
class BasePaperParser:
	r"""
	This is the base paper parser.
	The Parser separates a paper into subcomponents according to several separators.

	Args:
		separators (List[Tuple[str]]): Each tuple includes the separators that separate two components.
		content_names (Dict[int, Tuple[str]]): Key: the number of parsed components;
			Value: the name of each component, in order. Components sharing a name are merged.
		separator_tolerance (int): The tolerance of mismatch chars.
	"""
	def __init__(
		self,
		separators: List[Tuple[str]],
		content_names: Dict[int, Tuple[str]],
		separator_tolerance: int = 3
	):
		self.separators = separators
		self.content_names = content_names
		self.separator_tolerance = separator_tolerance

	# NOTE(review): this class does not derive from abc.ABC, so `abstractmethod` is not enforced
	# at instantiation time; subclasses are still expected to override `parse_title`.
	@abstractmethod
	def parse_title(self, file_path: Union[str, Path]) -> str:
		r"""
		Extract the title of a paper.

		Args:
			file_path (Union[str, Path]): The paper path.

		Returns:
			str: The title of the paper.
		"""
		...

	def to_documents(
		self,
		parsed_components: List[str],
		extra_info: Dict[str, str],
	) -> List[Document]:
		r"""
		Transform the parsed components to Documents.

		The component names are looked up in `content_names` by the number of parsed components.
		Texts of components that share a name are concatenated, in order of appearance.

		Args:
			parsed_components (List[str]): The separated component strings.
			extra_info (Dict[str, str]): The extra information will be recorded in the Document's metadata.

		Returns:
			List[Document]: The parsed Documents, one per distinct component name.

		Raises:
			KeyError: If `content_names` has no entry for the number of parsed components.
		"""
		component_names = self.content_names[len(parsed_components)]

		# Merge texts with the same name; dict insertion order keeps the first-occurrence order.
		merged = {}
		for idx, name in enumerate(component_names):
			if name in merged:
				merged[name] += parsed_components[idx]
			else:
				merged[name] = parsed_components[idx]

		# The content type is written first, so a same-named key in `extra_info` overrides it.
		return [
			Document(text=text, extra_info={CONTENT_TYPE_NAME: name, **extra_info})
			for name, text in merged.items()
		]

	def parse_paper(self, file_path: Union[str, Path]) -> List[Document]:
		r"""
		Split the article into main text, methods, extra info (references, extended data.) according to specific separators,
		then convert the separated texts to Documents.
		For example, separators for Nature are:

		Example:
			```python
			>>> [
			... 	("Online content", ),
			... 	("Methods", ),
			... 	("Data availability", "Code availability", "References")
			... ]
			```

		Args:
			file_path (Union[str, Path]): The paper path.

		Returns:
			List[Document]: The Documents built by `to_documents` from the separated paper text.
				The metadata of each Document includes `total_pages` and `file_path`.

		Raises:
			TypeError: If `file_path` is neither a string nor a Path.
		"""
		if not isinstance(file_path, (str, Path)):
			raise TypeError("file_path must be a string or Path.")

		separators = self.separators
		text_in_block = 4  # position of the text field inside each block returned by extractBLOCKS()
		components = []
		pending_texts = []  # block texts collected since the last separator
		# Separator groups below this index are ignored from now on
		# (presumably `get_sep_idx` returns the index of the matched separator group -- TODO confirm).
		sep_p = 0

		# The context manager closes the PDF even if parsing raises.
		with pymupdf.open(file_path) as doc:
			total_pages = len(doc)
			for page_idx, page in enumerate(doc):
				page_blocks = page.get_textpage().extractBLOCKS()
				# The first block of the first page is discarded (presumably a page header -- TODO confirm).
				# A first page without any text block is tolerated instead of raising IndexError.
				if page_idx == 0 and page_blocks:
					page_blocks.pop(0)
				for each_block in page_blocks:
					block_text = each_block[text_in_block]
					sep_idx = get_sep_idx(block_text, separators, self.separator_tolerance)
					if sep_p < len(separators) and sep_idx >= sep_p:
						# A separator was hit: close the current component; the separator block
						# itself becomes the first block of the next component.
						components.append(''.join(pending_texts))
						sep_p = sep_idx + 1
						pending_texts = []
					pending_texts.append(block_text)

		# Whatever follows the last separator forms the final component.
		components.append(''.join(pending_texts))

		extra_info = {
			"total_pages": total_pages,
			"file_path": str(file_path)
		}

		return self.to_documents(parsed_components=components, extra_info=extra_info)

labridge.func_modules.paper.parse.parsers.base.BasePaperParser.parse_paper(file_path)

Split the article into main text, methods, extra info (references, extended data.) according to specific separators. For example, separators for Nature are:

Example
>>> [
...     ("Online content", ),
...     ("Methods", ),
...     ("Data availability", "Code availability", "References")
... ]
PARAMETER DESCRIPTION
file_path

The paper path.

TYPE: Union[str, Path]

RETURNS DESCRIPTION
List[Document]

The Documents built from the separated paper text (for example: Main text, Methods, References), one per distinct component name. The metadata of each Document includes total_pages and file_path.

Note: the title is not part of the return value of this method; use parse_title to obtain it.
Source code in labridge\func_modules\paper\parse\parsers\base.py
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
def parse_paper(self, file_path: Union[str, Path]) -> List[Document]:
	r"""
	Split the article into main text, methods, extra info (references, extended data.) according to specific separators,
	then convert the separated texts to Documents.
	For example, separators for Nature are:

	Example:
		```python
		>>> [
		... 	("Online content", ),
		... 	("Methods", ),
		... 	("Data availability", "Code availability", "References")
		... ]
		```

	Args:
		file_path (Union[str, Path]): The paper path.

	Returns:
		List[Document]: The Documents built by `to_documents` from the separated paper text.
			The metadata of each Document includes `total_pages` and `file_path`.

	Raises:
		TypeError: If `file_path` is neither a string nor a Path.
	"""
	if not isinstance(file_path, (str, Path)):
		raise TypeError("file_path must be a string or Path.")

	separators = self.separators
	text_in_block = 4  # position of the text field inside each block returned by extractBLOCKS()
	components = []
	pending_texts = []  # block texts collected since the last separator
	# Separator groups below this index are ignored from now on
	# (presumably `get_sep_idx` returns the index of the matched separator group -- TODO confirm).
	sep_p = 0

	# The context manager closes the PDF even if parsing raises.
	with pymupdf.open(file_path) as doc:
		total_pages = len(doc)
		for page_idx, page in enumerate(doc):
			page_blocks = page.get_textpage().extractBLOCKS()
			# The first block of the first page is discarded (presumably a page header -- TODO confirm).
			# A first page without any text block is tolerated instead of raising IndexError.
			if page_idx == 0 and page_blocks:
				page_blocks.pop(0)
			for each_block in page_blocks:
				block_text = each_block[text_in_block]
				sep_idx = get_sep_idx(block_text, separators, self.separator_tolerance)
				if sep_p < len(separators) and sep_idx >= sep_p:
					# A separator was hit: close the current component; the separator block
					# itself becomes the first block of the next component.
					components.append(''.join(pending_texts))
					sep_p = sep_idx + 1
					pending_texts = []
				pending_texts.append(block_text)

	# Whatever follows the last separator forms the final component.
	components.append(''.join(pending_texts))

	extra_info = {
		"total_pages": total_pages,
		"file_path": str(file_path)
	}

	return self.to_documents(parsed_components=components, extra_info=extra_info)

labridge.func_modules.paper.parse.parsers.base.BasePaperParser.to_documents(parsed_components, extra_info)

Transform the parsed components to Documents.

PARAMETER DESCRIPTION
parsed_components

The separated component strings.

TYPE: List[str]

extra_info

The extra information will be recorded in the Document's metadata.

TYPE: Dict[str, str]

RETURNS DESCRIPTION
List[Document]

List[Document]: The parsed Documents.

Source code in labridge\func_modules\paper\parse\parsers\base.py
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def to_documents(
	self,
	parsed_components: List[str],
	extra_info: Dict[str, str],
) -> List[Document]:
	r"""
	Build one Document per distinct component name.

	The component names are looked up in `self.content_names` by the number of parsed components.
	Texts of components that share a name are concatenated, in order of appearance.

	Args:
		parsed_components (List[str]): The separated component strings.
		extra_info (Dict[str, str]): Extra information recorded in the metadata of every Document.

	Returns:
		List[Document]: The parsed Documents, ordered by the first appearance of each name.
	"""
	names = self.content_names[len(parsed_components)]

	# name -> merged text; dict insertion order keeps the first-occurrence order.
	text_by_name = {}
	for position, name in enumerate(names):
		if name in text_by_name:
			text_by_name[name] += parsed_components[position]
		else:
			text_by_name[name] = parsed_components[position]

	# The content type is written first, so a same-named key in `extra_info` overrides it.
	return [
		Document(text=text, extra_info={CONTENT_TYPE_NAME: name, **extra_info})
		for name, text in text_by_name.items()
	]