跳转至

IEEE parser

labridge.func_modules.paper.parse.parsers.ieee_parser

labridge.func_modules.paper.parse.parsers.ieee_parser.IEEEPaperParser

Bases: BasePaperParser

Parse the paper according to the IEEE template.

PARAMETER DESCRIPTION
separators

Each tuple includes the separators that separate two components. Defaults to IEEE_SEPARATORS.

TYPE: List[Tuple[str]] DEFAULT: None

content_names

Key: component index; Value: component name candidates. Defaults to IEEE_CONTENT_NAMES.

TYPE: Dict[int, Tuple[str]] DEFAULT: None

separator_tolerance

The tolerated number of mismatched characters.

TYPE: int DEFAULT: 3

Source code in labridge\func_modules\paper\parse\parsers\ieee_parser.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
class IEEEPaperParser(BasePaperParser):
	r"""
	Parse the paper according to the IEEE template.

	Args:
		separators (List[Tuple[str]]): Each tuple includes the separators that separate two components.
			Defaults to `IEEE_SEPARATORS`.
		content_names (Dict[int, Tuple[str]]): Key: component index; Value: component name candidates.
			Defaults to `IEEE_CONTENT_NAMES`.
		separator_tolerance (int): The tolerated number of mismatched characters.
	"""
	def __init__(
		self,
		separators: List[Tuple[str]] = None,
		content_names: Dict[int, Tuple[str]] = None,
		separator_tolerance: int = 3
	):
		# Any falsy argument (None or an empty container) falls back to the IEEE defaults.
		separators = separators or IEEE_SEPARATORS
		content_names = content_names or IEEE_CONTENT_NAMES
		super().__init__(separators, content_names, separator_tolerance)

	def parse_title(self, file_path: Union[str, Path]) -> str:
		r"""
		Return the text of the first text block on the first page, with newlines removed.

		This is a positional heuristic; using an LLM to extract the title and other
		information is suggested instead.

		Args:
			file_path (Union[str, Path]): Path of the paper file to open.

		Returns:
			str: The first block's text with every newline character removed.

		Raises:
			IndexError: If no block is extracted from the first page.
		"""
		# Open the document in a `with` block so it is closed even if extraction fails.
		with pymupdf.open(file_path) as doc:
			page = doc[0].get_textpage()
			page_blocks = page.extractBLOCKS()

		# Item 4 of a block tuple is the block's text.
		title = page_blocks[0][4].replace("\n", "")
		return title

labridge.func_modules.paper.parse.parsers.ieee_parser.IEEEPaperParser.parse_title(file_path)

Return the text of the first text block on the first page as the title. Using an LLM to extract the title and other information is suggested instead.

Source code in labridge\func_modules\paper\parse\parsers\ieee_parser.py
49
50
51
52
53
54
55
56
def parse_title(self, file_path: Union[str, Path]) -> str:
	r"""
	Return the text of the first text block on the first page, with newlines removed.

	This is a positional heuristic; using an LLM to extract the title and other
	information is suggested instead.

	Args:
		file_path (Union[str, Path]): Path of the paper file to open.

	Returns:
		str: The first block's text with every newline character removed.

	Raises:
		IndexError: If no block is extracted from the first page.
	"""
	# Open the document in a `with` block so it is closed even if extraction fails.
	with pymupdf.open(file_path) as doc:
		page = doc[0].get_textpage()
		page_blocks = page.extractBLOCKS()

	# Item 4 of a block tuple is the block's text.
	title = page_blocks[0][4].replace("\n", "")
	return title