跳转至

Nature parser

labridge.func_modules.paper.parse.parsers.nature_parser

labridge.func_modules.paper.parse.parsers.nature_parser.NaturePaperParser

Bases: BasePaperParser

Parse the paper according to the Nature template.

PARAMETER DESCRIPTION
separators

Each tuple includes the separators that separate two components. Defaults to NATURE_SEPARATORS.

TYPE: List[Tuple[str]] DEFAULT: None

content_names

Key: component index; Value: component name candidates. Defaults to NATURE_CONTENT_NAMES.

TYPE: Dict[int, Tuple[str]] DEFAULT: None

separator_tolerance

The tolerance of mismatch chars.

TYPE: int DEFAULT: 3

Source code in labridge\func_modules\paper\parse\parsers\nature_parser.py
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
class NaturePaperParser(BasePaperParser):
	r"""
	Parse the paper according to the Nature template.

	Args:
		separators (List[Tuple[str]]): Each tuple includes the separators that separate two components.
			Defaults to `NATURE_SEPARATORS`.
		content_names (Dict[int, Tuple[str]]): Key: component index; Value: component name candidates.
			Defaults to `NATURE_CONTENT_NAMES`.
		separator_tolerance (int): The tolerance of mismatch chars.
	"""
	def __init__(self,
				 separators: List[Tuple[str]] = None,
				 content_names: Dict[int, Tuple[str]] = None,
				 separator_tolerance: int = 3):
		# Any falsy argument (None or an empty container) falls back to the Nature defaults.
		separators = separators or NATURE_SEPARATORS
		content_names = content_names or NATURE_CONTENT_NAMES
		super().__init__(separators, content_names, separator_tolerance)

	def parse_title(self, file_path: Union[str, Path]) -> Union[str, None]:
		r"""
		Read the paper title from the table of contents embedded in the PDF.

		The title is element 1 of the first table-of-contents entry. This is a best-effort
		lookup; using an LLM to extract the title and other information is recommended.

		Args:
			file_path (Union[str, Path]): Path of the PDF file to open.

		Returns:
			Union[str, None]: The title read from the table of contents, or None when
				no title could be read.
		"""
		# Only the table of contents is needed, so close the document right after reading
		# it instead of leaving the file handle open.
		with pymupdf.open(file_path) as doc:
			toc = doc.get_toc()

		title = None
		try:
			# Descend into the first entry for as long as it is itself a list; element 1 of
			# that entry is taken as the title.
			# NOTE(review): presumably each entry is [level, title, page] -- confirm against PyMuPDF.
			while isinstance(toc[0], list):
				toc = toc[0]
				title = toc[1]
		except IndexError:
			# Empty (or too short) table of contents: report it and return None.
			print(f">>> PyMupdf failed to get toc from {file_path}")
		return title

labridge.func_modules.paper.parse.parsers.nature_parser.NaturePaperParser.parse_title(file_path)

Using an LLM to extract the title and other information is recommended.

Source code in labridge\func_modules\paper\parse\parsers\nature_parser.py
48
49
50
51
52
53
54
55
56
57
58
59
def parse_title(self, file_path: Union[str, Path]) -> Union[str, None]:
	r"""
	Read the paper title from the table of contents embedded in the PDF.

	The title is element 1 of the first table-of-contents entry. This is a best-effort
	lookup; using an LLM to extract the title and other information is recommended.

	Args:
		file_path (Union[str, Path]): Path of the PDF file to open.

	Returns:
		Union[str, None]: The title read from the table of contents, or None when
			no title could be read.
	"""
	# The document is only needed to read the table of contents; the context manager
	# closes it afterwards so the file handle is not leaked.
	with pymupdf.open(file_path) as doc:
		toc = doc.get_toc()

	title = None
	try:
		# Descend into the first entry for as long as it is itself a list; element 1 of
		# that entry is taken as the title.
		# NOTE(review): presumably each entry is [level, title, page] -- confirm against PyMuPDF.
		while isinstance(toc[0], list):
			toc = toc[0]
			title = toc[1]
	except IndexError:
		# Empty (or too short) table of contents: report it and return None.
		print(f">>> PyMupdf failed to get toc from {file_path}")
	return title