跳转至

Default parser

labridge.func_modules.paper.parse.parsers.default_parser

labridge.func_modules.paper.parse.parsers.default_parser.DefaultPaperParser

The default paper parser marks the whole paper content as 'MainText'.

Source code in labridge\func_modules\paper\parse\parsers\default_parser.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
class DefaultPaperParser:
	r"""
	Default paper parser: the whole paper is treated as a single piece of
	content whose content type is recorded as "MainText".
	"""
	def parse_paper(self, file_path: Union[str, Path]) -> List[Document]:
		r"""
		Parse the paper into a single document holding the text of every page.

		Args:
			file_path (Union[str, Path]): Path of the paper file, passed as-is to `pymupdf.open`.

		Returns:
			List[Document]: A one-element list. The document's text is the
			concatenation of all page texts in page order, and its `extra_info`
			carries `"total_pages"` and the `CONTENT_TYPE_NAME` entry set to "MainText".
		"""
		# Open via the context manager so the file handle is released even if
		# text extraction fails part-way through.
		with pymupdf.open(file_path) as pdf:
			total_pages = len(pdf)
			# Keep the page text as `str`: encoding each page to bytes and then
			# joining with a `str` separator raises TypeError on Python 3.
			paper_text = ''.join(page.get_text() for page in pdf)

		extra_info = {
			"total_pages": total_pages,
			CONTENT_TYPE_NAME: "MainText"
		}
		return [Document(text=paper_text, extra_info=extra_info)]

labridge.func_modules.paper.parse.parsers.default_parser.DefaultPaperParser.parse_paper(file_path)

Parse the paper.

PARAMETER DESCRIPTION
file_path

TYPE: Union[str, Path]

RETURNS DESCRIPTION
List[Document]

List[Document]: The parsed documents.

Source code in labridge\func_modules\paper\parse\parsers\default_parser.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
def parse_paper(self, file_path: Union[str, Path]) -> List[Document]:
	r"""
	Parse the paper into a single document holding the text of every page.

	Args:
		file_path (Union[str, Path]): Path of the paper file, passed as-is to `pymupdf.open`.

	Returns:
		List[Document]: A one-element list. The document's text is the
		concatenation of all page texts in page order, and its `extra_info`
		carries `"total_pages"` and the `CONTENT_TYPE_NAME` entry set to "MainText".
	"""
	# Open via the context manager so the file handle is released even if
	# text extraction fails part-way through.
	with pymupdf.open(file_path) as pdf:
		total_pages = len(pdf)
		# Keep the page text as `str`: encoding each page to bytes and then
		# joining with a `str` separator raises TypeError on Python 3.
		paper_text = ''.join(page.get_text() for page in pdf)

	extra_info = {
		"total_pages": total_pages,
		CONTENT_TYPE_NAME: "MainText"
	}
	return [Document(text=paper_text, extra_info=extra_info)]