跳转至

Metadata extract

labridge.func_modules.paper.parse.extractors.metadata_extract

labridge.func_modules.paper.parse.extractors.metadata_extract.PaperMetadataExtractor

This class uses an LLM to extract metadata from a paper.

The LLM is instructed to extract all DEFAULT_NECESSARY_METADATA. The LLM is encouraged to extract DEFAULT_OPTIONAL_METADATA.

PARAMETER DESCRIPTION
llm

The LLM to use.

TYPE: LLM DEFAULT: None

necessary_metadata

The LLM is instructed to extract all necessary_metadata. Defaults to DEFAULT_NECESSARY_METADATA.

TYPE: Dict[str, str] DEFAULT: None

optional_metadata

The LLM is encouraged to extract optional_metadata. Defaults to DEFAULT_OPTIONAL_METADATA.

TYPE: Dict[str, str] DEFAULT: None

max_retry_times

The maximum retry times for extracting necessary_metadata.

TYPE: int DEFAULT: 2

service_context

The context including llm, embed_model, etc.

TYPE: ServiceContext DEFAULT: None

Source code in labridge\func_modules\paper\parse\extractors\metadata_extract.py
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
class PaperMetadataExtractor:
	r"""
	Extract metadata from a paper with an LLM.

	The LLM is instructed to extract every entry of `necessary_metadata` and is encouraged to
	extract the entries of `optional_metadata`. Both dictionaries map a metadata name to the
	extraction hint that is embedded in the prompt.

	Args:
		llm (LLM): The LLM to use. If None, it is resolved through
			`llm_from_settings_or_context(Settings, service_context)`.
		necessary_metadata (Dict[str, str]): The LLM is instructed to extract all necessary_metadata.
			Defaults to `DEFAULT_NECESSARY_METADATA`.
		optional_metadata (Dict[str, str]): The LLM is encouraged to extract optional_metadata.
			Defaults to `DEFAULT_OPTIONAL_METADATA`.
		max_retry_times (int): The maximum retry times for extracting necessary_metadata.
			One initial attempt is always made, so at most `max_retry_times + 1` queries are issued.
		service_context (ServiceContext): The context including llm, embed_model, etc.
	"""

	def __init__(
		self,
		llm: LLM = None,
		necessary_metadata: Dict[str, str] = None,
		optional_metadata: Dict[str, str] = None,
		max_retry_times: int = 2,
		service_context: ServiceContext = None,
	):
		self.necessary_metadata = necessary_metadata or DEFAULT_NECESSARY_METADATA
		self.optional_metadata = optional_metadata or DEFAULT_OPTIONAL_METADATA
		if llm is None:
			llm = llm_from_settings_or_context(Settings, service_context)
		self.llm = llm

		self.prompt_tmpl = self.get_prompt_tmpl()
		self.doi_worker = DOIWorker()
		# Pass the resolved LLM (not the raw, possibly-None argument) so the query engine
		# and this extractor always talk to the same model.
		self.query_engine = SingleQueryEngine(llm=self.llm, prompt_tmpl=self.prompt_tmpl)
		self.max_retry_times = max_retry_times

	def _default_transformations(self) -> List[TransformComponent]:
		r""" Return the default transformations: a sentence splitter followed by a keyword extractor. """
		return [
				SentenceSplitter(chunk_size=1024, chunk_overlap=256, include_metadata=True),
				KeywordExtractor(keywords=5, llm=self.llm),
			]

	def get_prompt_tmpl(
		self,
		necessary_metadata: Dict[str, str] = None,
		optional_metadata: Dict[str, str] = None,
	) -> str:
		r"""
		Build the prompt template used for extracting metadata, according to the
		`necessary_metadata` and `optional_metadata`.

		The returned template contains a single `{}` placeholder where the first page of the
		paper is expected to be inserted.

		Args:
			necessary_metadata (Dict[str, str]): necessary metadata. Defaults to `self.necessary_metadata`
				(also used when an empty dict is given).
			optional_metadata (Dict[str, str]): optional metadata. Defaults to `self.optional_metadata`
				(also used when an empty dict is given).

		Returns:
			str: The prompt template.
		"""
		necessary_metadata = necessary_metadata or self.necessary_metadata
		optional_metadata = optional_metadata or self.optional_metadata

		parts = [
			"Here is the first page of a research paper. "
			"You need try to extract some information from it.\n\n"
			"The NECESSARY metadata that you MUST extract contain:\n",
			', '.join(necessary_metadata),
			"\n\nIt is better to extract the following metadata,"
			"But if a optional metadata does not appear in the paper, you do not need to output it.\n",
			', '.join(optional_metadata),
			"\n\n"
			"Here are some suggestions for you to extract these metadata:",
			"\n\nSuggestions for extracting NECESSARY metadata:\n",
		]
		parts.extend(f"**{key}**: {hint}\n" for key, hint in necessary_metadata.items())
		parts.append(
			"\n\n"
			"Suggestions for extracting optional metadata:\n"
		)
		parts.extend(f"**{key}**: {hint}\n" for key, hint in optional_metadata.items())
		parts.append(
			"\n\n"
			"The first page of the paper is as follows:\n"
			"{}"
		)
		parts.append(
			"\n\nOutput your extracted metadata as the following FORMAT:\n"
			"**metadata_name**: <extracted corresponding metadata>\n\n"
			"For example:\n"
			"**Title**: Chaotic dynamics in memristor\n"
			"**DOI**: 10.2003-233/1245\n\n"
			"You should output valid extracted metadata, DO NOT output the given example:\n"
			"**Title**: Chaotic dynamics in memristor\n"
			"**DOI**: 10.2003-233/1245\n\n"
			"Now list your extracted metadata as follows:\n\n"
		)
		# One empty answer slot per requested key, necessary keys first.
		parts.extend(f"**{key}**: \n\n" for key in necessary_metadata)
		parts.extend(f"**{key}**: \n\n" for key in optional_metadata)
		return "".join(parts)

	def _set_query_prompt(
		self,
		necessary_metadata: Dict[str, str] = None,
		optional_metadata: Dict[str, str] = None,
	):
		r""" If both `necessary_metadata` and `optional_metadata` are None, set the default prompt. """
		self.query_engine.prompt_tmpl = self.get_prompt_tmpl(necessary_metadata, optional_metadata)

	def metadata_output_format(self, llm_answer: str) -> Dict[str, str]:
		r"""
		The LLM is supposed to answer like this:

		- **metadata_name 1**: extracted metadata 1.
		- **metadata_name 2**: extracted metadata 2.

		Extract a metadata dictionary from the answer of llm.

		Only names present in `self.necessary_metadata` or `self.optional_metadata` are kept.
		Entries whose value is empty are dropped, so that an unanswered slot still counts as
		lacking and can be retried.

		Args:
			llm_answer (str): The LLM Output.

		Returns:
			Dict[str, str]: The recognized metadata names mapped to their extracted values.
		"""
		known_keys = set(self.necessary_metadata) | set(self.optional_metadata)
		pieces = llm_answer.split("**")
		metadata = dict()

		# pieces[0] is whatever precedes the first "**"; after that, names sit at odd
		# indices and the text following each name at the next even index.
		for key, val in zip(pieces[1::2], pieces[2::2]):
			key = key.replace("\n", "").strip()
			if key not in known_keys:
				continue
			val = val.replace("\n", "").strip()
			# Drop only the leading separator; colons inside the value must survive.
			if val.startswith(":"):
				val = val[1:].strip()
			if val:
				metadata[key] = val
		return metadata

	def _extract_metadata(
		self,
		pdf_path: Union[Path, str] = None,
		pdf_docs: List[Document] = None,
		necessary_metadata: Dict[str, str] = None,
		optional_metadata: Dict[str, str] = None,
	) -> Dict[str, str]:
		r"""
		Use the LLM to extract metadata of a paper from its first page.

		Args:
			pdf_path: (Union[Path, str]): the path of a pdf paper. Takes precedence over `pdf_docs`.
			pdf_docs (List[Document]): the documents of a pdf paper.
			necessary_metadata (Dict[str, str]): necessary metadata to ask for in this query.
				Defaults to `self.necessary_metadata`.
			optional_metadata (Dict[str, str]): optional metadata to ask for in this query.
				Defaults to `self.optional_metadata`.

		Returns:
			metadata (Dict[str, str]): The extracted meta data.

		Raises:
			ValueError: If both `pdf_path` and `pdf_docs` are None.
		"""

		if pdf_path is not None:
			pdf_docs = PyMuPDFReader().load_data(file_path=pdf_path)
		elif pdf_docs is None:
			raise ValueError("pdf_path and pdf_docs can not both be None.")

		first_page = pdf_docs[0].text
		self._set_query_prompt(necessary_metadata, optional_metadata)
		try:
			response = self.query_engine.query(first_page)
		finally:
			# Always restore the default prompt, even when the query fails, so the
			# engine is never left with the narrowed per-call prompt.
			self._set_query_prompt()
		return self.metadata_output_format(response.response)

	def _lacked_metadata(self, paper_metadata: Dict[str, str]) -> Tuple[Dict, Dict]:
		r"""
		Return current lacked metadata.

		A key counts as lacking when it is absent from `paper_metadata`. The returned
		dictionaries keep the key order of `self.necessary_metadata` / `self.optional_metadata`.

		Args:
			paper_metadata (Dict[str, str]): Extracted metadata.

		Returns:
			Tuple[Dict, Dict]: The lacked necessary metadata and lacked optional metadata
		"""
		lack_necessary_metadata = {
			key: hint for key, hint in self.necessary_metadata.items() if key not in paper_metadata
		}
		lack_optional_metadata = {
			key: hint for key, hint in self.optional_metadata.items() if key not in paper_metadata
		}
		return lack_necessary_metadata, lack_optional_metadata

	def extract_paper_metadata(
		self,
		pdf_path: Union[Path, str] = None,
		pdf_docs: List[Document] = None,
		show_progress: bool = True,
		extra_metadata: dict = None,
	) -> Optional[Dict[str, str]]:
		r"""
		Extract required metadata from a paper.
		Title and DOI are necessary; the DOI is looked up from the title through
		`self.doi_worker.find_doi_by_title` (the CrossRef API, per the original documentation).
		If either of them is missing, this method returns None.

		Args:
			pdf_path (Union[Path, str]): The file path of the paper.
			pdf_docs (List[Document]): If the pdf_path is not provided, the provided pdf_docs will be used.
				pdf_docs and pdf_path cannot both be None.
			show_progress (bool): Whether to show the inner progress.
			extra_metadata (dict): Existing metadata obtained by approaches such as arXiv API.
				It is copied, not modified.

		Returns:
			Optional[Dict[str, str]]: The extracted metadata, or None if no title was obtained
				or the DOI lookup failed.

		Raises:
			ValueError: If both `pdf_path` and `pdf_docs` are None.
		"""
		if pdf_path:
			pdf_docs = PyMuPDFReader().load_data(file_path=pdf_path)
		elif pdf_docs is None:
			raise ValueError("pdf_path and pdf_docs can not both be None.")

		# Work on a copy so the caller's dictionary is not modified as a side effect.
		paper_metadata = dict(extra_metadata) if extra_metadata else dict()
		lack_necessary_metadata, _ = self._lacked_metadata(paper_metadata)
		retry_count = 0
		# `<=` is deliberate: one initial attempt plus up to `max_retry_times` retries.
		while lack_necessary_metadata and retry_count <= self.max_retry_times:
			new_metadata = self._extract_metadata(
				pdf_docs=pdf_docs,
				necessary_metadata=lack_necessary_metadata,
				optional_metadata=self.optional_metadata,
			)
			retry_count += 1
			if show_progress:
				print_text(f">>>\tExtract try idx {retry_count}: {list(new_metadata.keys())}", color="cyan", end="\n")
			paper_metadata.update(new_metadata)
			lack_necessary_metadata, _ = self._lacked_metadata(paper_metadata)

		title = paper_metadata.get(PAPER_TITLE, None)
		# An empty title is as useless as a missing one for the DOI lookup.
		if not title:
			return None

		# find doi according to title
		doi = paper_metadata.get(PAPER_DOI, None)
		doi = self.doi_worker.find_doi_by_title(title=title, input_doi=doi)
		if doi is None:
			print("DOI find fails.")
			return None

		paper_metadata[PAPER_DOI] = doi
		return paper_metadata

labridge.func_modules.paper.parse.extractors.metadata_extract.PaperMetadataExtractor.extract_paper_metadata(pdf_path=None, pdf_docs=None, show_progress=True, extra_metadata=None)

Extract required metadata from a paper. Title and DOI are necessary; we use the CrossRef API to get the DOI of a paper according to its title. If either of them is missing, this method returns None.

PARAMETER DESCRIPTION
pdf_path

The file path of the paper.

TYPE: Union[Path, str] DEFAULT: None

pdf_docs

If the pdf_path is not provided, the provided pdf_docs will be used. pdf_docs and pdf_path cannot both be None.

TYPE: List[Document] DEFAULT: None

show_progress

Whether to show the inner progress.

TYPE: bool DEFAULT: True

extra_metadata

Existing metadata obtained by approaches such as arXiv API.

TYPE: dict DEFAULT: None

RETURNS DESCRIPTION
Optional[Dict[str, str]]

Dict[str, str]: The extracted metadata.

Source code in labridge\func_modules\paper\parse\extractors\metadata_extract.py
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
def extract_paper_metadata(
	self,
	pdf_path: Union[Path, str] = None,
	pdf_docs: List[Document] = None,
	show_progress: bool = True,
	extra_metadata: dict = None,
) -> Optional[Dict[str, str]]:
	r"""
	Extract required metadata from a paper.
	Title and DOI are necessary; the DOI is looked up from the title through
	`self.doi_worker.find_doi_by_title` (the CrossRef API, per the original documentation).
	If either of them is missing, this method returns None.

	Args:
		pdf_path (Union[Path, str]): The file path of the paper.
		pdf_docs (List[Document]): If the pdf_path is not provided, the provided pdf_docs will be used.
			pdf_docs and pdf_path cannot both be None.
		show_progress (bool): Whether to show the inner progress.
		extra_metadata (dict): Existing metadata obtained by approaches such as arXiv API.
			It is copied, not modified.

	Returns:
		Optional[Dict[str, str]]: The extracted metadata, or None if no title was obtained
			or the DOI lookup failed.

	Raises:
		ValueError: If both `pdf_path` and `pdf_docs` are None.
	"""
	if pdf_path:
		pdf_docs = PyMuPDFReader().load_data(file_path=pdf_path)
	elif pdf_docs is None:
		raise ValueError("pdf_path and pdf_docs can not both be None.")

	# Work on a copy so the caller's dictionary is not modified as a side effect.
	paper_metadata = dict(extra_metadata) if extra_metadata else dict()
	lack_necessary_metadata, _ = self._lacked_metadata(paper_metadata)
	retry_count = 0
	# `<=` is deliberate: one initial attempt plus up to `max_retry_times` retries.
	while lack_necessary_metadata and retry_count <= self.max_retry_times:
		new_metadata = self._extract_metadata(
			pdf_docs=pdf_docs,
			necessary_metadata=lack_necessary_metadata,
			optional_metadata=self.optional_metadata,
		)
		retry_count += 1
		if show_progress:
			print_text(f">>>\tExtract try idx {retry_count}: {list(new_metadata.keys())}", color="cyan", end="\n")
		paper_metadata.update(new_metadata)
		lack_necessary_metadata, _ = self._lacked_metadata(paper_metadata)

	title = paper_metadata.get(PAPER_TITLE, None)
	# An empty title is as useless as a missing one for the DOI lookup.
	if not title:
		return None

	# find doi according to title
	doi = paper_metadata.get(PAPER_DOI, None)
	doi = self.doi_worker.find_doi_by_title(title=title, input_doi=doi)
	if doi is None:
		print("DOI find fails.")
		return None

	paper_metadata[PAPER_DOI] = doi
	return paper_metadata

labridge.func_modules.paper.parse.extractors.metadata_extract.PaperMetadataExtractor.get_prompt_tmpl(necessary_metadata=None, optional_metadata=None)

This function is used to get the prompt template used for extracting metadata, according to the necessary_metadata and optional_metadata.

PARAMETER DESCRIPTION
necessary_metadata

necessary metadata, Defaults to self.necessary_metadata.

TYPE: Dict[str, str] DEFAULT: None

optional_metadata

optional metadata, Defaults to self.optional_metadata.

TYPE: Dict[str, str] DEFAULT: None

Source code in labridge\func_modules\paper\parse\extractors\metadata_extract.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
def get_prompt_tmpl(
	self,
	necessary_metadata: Dict[str, str] = None,
	optional_metadata: Dict[str, str] = None,
) -> str:
	r"""
	Assemble the prompt template that asks the LLM to extract metadata from the first page of a paper.

	The template lists the requested metadata names, the per-name extraction hints, a single
	`{}` placeholder for the first page, the expected answer format, and one empty answer
	slot per requested name (necessary names first).

	Args:
		necessary_metadata (Dict[str, str]): necessary metadata names mapped to their hints.
			Falls back to `self.necessary_metadata` when None or empty.
		optional_metadata (Dict[str, str]): optional metadata names mapped to their hints.
			Falls back to `self.optional_metadata` when None or empty.

	Returns:
		str: The prompt template.
	"""
	required = necessary_metadata if necessary_metadata else self.necessary_metadata
	extras = optional_metadata if optional_metadata else self.optional_metadata

	required_hints = "".join(f"**{name}**: {hint}\n" for name, hint in required.items())
	extra_hints = "".join(f"**{name}**: {hint}\n" for name, hint in extras.items())
	answer_slots = "".join(f"**{name}**: \n\n" for name in [*required, *extras])

	sections = [
		# Task statement and the names to extract.
		"Here is the first page of a research paper. ",
		"You need try to extract some information from it.\n\n",
		"The NECESSARY metadata that you MUST extract contain:\n",
		", ".join(required),
		"\n\nIt is better to extract the following metadata,",
		"But if a optional metadata does not appear in the paper, you do not need to output it.\n",
		", ".join(extras),
		# Per-name extraction hints.
		"\n\n",
		"Here are some suggestions for you to extract these metadata:",
		"\n\nSuggestions for extracting NECESSARY metadata:\n",
		required_hints,
		"\n\n",
		"Suggestions for extracting optional metadata:\n",
		extra_hints,
		# Placeholder for the page text.
		"\n\n",
		"The first page of the paper is as follows:\n",
		"{}",
		# Answer format, example, and the slots to fill in.
		"\n\nOutput your extracted metadata as the following FORMAT:\n",
		"**metadata_name**: <extracted corresponding metadata>\n\n",
		"For example:\n",
		"**Title**: Chaotic dynamics in memristor\n",
		"**DOI**: 10.2003-233/1245\n\n",
		"You should output valid extracted metadata, DO NOT output the given example:\n",
		"**Title**: Chaotic dynamics in memristor\n",
		"**DOI**: 10.2003-233/1245\n\n",
		"Now list your extracted metadata as follows:\n\n",
		answer_slots,
	]
	return "".join(sections)

labridge.func_modules.paper.parse.extractors.metadata_extract.PaperMetadataExtractor.metadata_output_format(llm_answer)

The LLM is supposed to answer like this:

  • metadata_name 1: extracted metadata 1.
  • metadata_name 2: extracted metadata 2.

Extract a metadata dictionary from the answer of llm.

PARAMETER DESCRIPTION
llm_answer

The LLM Output.

TYPE: str

Source code in labridge\func_modules\paper\parse\extractors\metadata_extract.py
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
def metadata_output_format(self, llm_answer: str) -> Dict[str, str]:
	r"""
	The LLM is supposed to answer like this:

	- **metadata_name 1**: extracted metadata 1.
	- **metadata_name 2**: extracted metadata 2.

	Extract a metadata dictionary from the answer of llm.

	Only names present in `self.necessary_metadata` or `self.optional_metadata` are kept.
	Entries whose value is empty are dropped, so that an unanswered slot is not reported
	as extracted.

	Args:
		llm_answer (str): The LLM Output.

	Returns:
		Dict[str, str]: The recognized metadata names mapped to their extracted values.
	"""
	known_keys = set(self.necessary_metadata) | set(self.optional_metadata)
	pieces = llm_answer.split("**")
	metadata = dict()

	# pieces[0] is whatever precedes the first "**"; after that, names sit at odd
	# indices and the text following each name at the next even index.
	for key, val in zip(pieces[1::2], pieces[2::2]):
		key = key.replace("\n", "").strip()
		if key not in known_keys:
			continue
		val = val.replace("\n", "").strip()
		# Drop only the leading separator; colons inside the value must survive.
		if val.startswith(":"):
			val = val[1:].strip()
		if val:
			metadata[key] = val
	return metadata