跳转至

Source analyze

labridge.func_modules.paper.parse.extractors.source_analyze

labridge.func_modules.paper.parse.extractors.source_analyze.PaperSourceAnalyzer

This class analyzes the source of a paper, such as 'Nature' or 'IEEE'.

By default, the source analysis is based on keyword occurrence counts. An LLM can also be used to help analyze the source.

PARAMETER DESCRIPTION
llm

The used LLM.

TYPE: LLM DEFAULT: None

service_context

The service context.

TYPE: ServiceContext DEFAULT: None

keyword_count_threshold

A PaperSource is selected as a candidate only if its corresponding keyword occurrence count exceeds this threshold.

TYPE: int DEFAULT: 10

Source code in labridge\func_modules\paper\parse\extractors\source_analyze.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
class PaperSourceAnalyzer:
	r"""
	Analyze the source of a paper, such as 'Nature' or 'IEEE'.

	By default the analysis relies on the PDF metadata and on keyword occurrence counts.
	An LLM can optionally be used as a further fallback.

	Args:
		llm (LLM): The LLM to use. If it is falsy, the LLM is obtained through
			`llm_from_settings_or_context(Settings, service_context)`.
		service_context (ServiceContext): The service context.
		keyword_count_threshold (int): A PaperSource is selected as a candidate
			only if its corresponding keyword occurrence count exceeds this threshold.
	"""
	def __init__(
		self,
		llm: LLM = None,
		service_context: ServiceContext = None,
		keyword_count_threshold: int = 10,
	):
		self.llm = llm or llm_from_settings_or_context(Settings, service_context)
		self.keyword_count_threshold = keyword_count_threshold

	def reader_analyze(self, paper_path: Union[Path, str]) -> PaperSource:
		"""
		Analyze the paper source using the PDF document information dictionary.

		Only the '/Subject' entry of the '/Info' dictionary is inspected.

		Args:
			paper_path (Union[Path, str]): The paper path.

		Returns:
			PaperSource: `PaperSource.NATURE` if the subject contains the Nature keyword
			(case-insensitive); `PaperSource.IEEE` if the subject is at least as long as
			that keyword but does not contain it; `None` if the PDF has no '/Info'
			dictionary, no '/Subject' entry, or a subject shorter than the keyword.
		"""
		import PyPDF2

		keyword = PaperSource.NATURE.upper()

		with open(paper_path, 'rb') as file:
			trailer = PyPDF2.PdfReader(file).trailer
			# PDFs without a document information dictionary are valid;
			# report "unknown" instead of raising KeyError.
			if '/Info' not in trailer:
				return None
			file_info = trailer['/Info']
			if '/Subject' not in file_info:
				return None
			# Read the value while the file is still open: PyPDF2 may have to
			# resolve an indirect reference by reading from the stream.
			subject = file_info['/Subject']

		# A subject too short to hold the keyword carries no usable hint.
		if len(subject) < len(keyword):
			return None
		# Only text subjects can match; any other long-enough subject
		# falls through to IEEE, as before.
		if isinstance(subject, str) and keyword in subject.upper():
			return PaperSource.NATURE
		return PaperSource.IEEE

	def llm_analyze(self, paper_path: Union[Path, str]) -> PaperSource:
		"""
		Placeholder for LLM-based source analysis.

		TODO: using llm. For now `paper_path` is ignored and `PaperSource.DEFAULT` is returned.
		"""
		return PaperSource.DEFAULT

	def keyword_analyze(self, paper_path: Union[Path, str]) -> PaperSource:
		r"""
		Analyze the paper source based on keyword occurrence count.

		Every word (regex `\w+`) on every page is compared case-insensitively with
		`PaperSource.NATURE`.

		Args:
			paper_path (Union[Path, str]): The paper path.

		Returns:
			PaperSource: `PaperSource.NATURE` if the keyword occurs more than
			`keyword_count_threshold` times, otherwise `PaperSource.IEEE`.
		"""
		import pymupdf
		import re

		keyword = PaperSource.NATURE.upper()
		word_pattern = re.compile(r"\w+")

		count = 0
		# The context manager closes the document once the text has been scanned.
		with pymupdf.open(paper_path) as doc:
			for page in doc:
				count += sum(
					1
					for token in word_pattern.findall(page.get_text())
					if token.upper() == keyword
				)

		if count > self.keyword_count_threshold:
			return PaperSource.NATURE
		return PaperSource.IEEE

	def analyze_source(self, paper_path: Union[Path, str], use_llm: bool = False) -> PaperSource:
		r"""
		Sequentially use `reader_analyze`, `keyword_analyze`, and `llm_analyze` to analyze the paper source.

		Each later step runs only if all earlier steps returned `None`.

		NOTE: `keyword_analyze` currently always returns either NATURE or IEEE, so the
		LLM step and the final `PaperSource.DEFAULT` fallback are not reached once
		keyword analysis has run.

		Args:
			paper_path (Union[Path, str]): The paper path.
			use_llm (bool): Whether to use `llm_analyze`.

		Returns:
			PaperSource: The first non-None result, or `PaperSource.DEFAULT`.
		"""
		source = self.reader_analyze(paper_path)
		if source is None:
			source = self.keyword_analyze(paper_path)
		if source is None and use_llm:
			source = self.llm_analyze(paper_path)
		if source is None:
			source = PaperSource.DEFAULT
		return source

labridge.func_modules.paper.parse.extractors.source_analyze.PaperSourceAnalyzer.analyze_source(paper_path, use_llm=False)

Sequentially use reader_analyze, keyword_analyze, and llm_analyze to analyze the paper source

PARAMETER DESCRIPTION
paper_path

The paper path.

TYPE: Union[Path, str]

use_llm

Whether to use llm_analyze.

TYPE: bool DEFAULT: False

RETURNS DESCRIPTION
PaperSource

PaperSource

Source code in labridge\func_modules\paper\parse\extractors\source_analyze.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
def analyze_source(self, paper_path: Union[Path, str], use_llm = False) -> PaperSource:
	r"""
	Determine the paper source by trying the analyzers in order.

	`reader_analyze` is tried first, then `keyword_analyze`, and finally, only when
	`use_llm` is true, `llm_analyze`. The first result that is not `None` is returned.

	Args:
		paper_path (Union[Path, str]): The paper path.
		use_llm (bool): Whether to use `llm_analyze`.

	Returns:
		PaperSource: The detected source, or `PaperSource.DEFAULT` if every
		analyzer that ran returned `None`.
	"""
	detected = self.reader_analyze(paper_path)
	if detected is not None:
		return detected

	detected = self.keyword_analyze(paper_path)
	if detected is not None:
		return detected

	# Both cheap analyzers were inconclusive; the LLM is consulted only on request.
	if use_llm:
		detected = self.llm_analyze(paper_path)
	return PaperSource.DEFAULT if detected is None else detected

labridge.func_modules.paper.parse.extractors.source_analyze.PaperSourceAnalyzer.keyword_analyze(paper_path)

Analyze the paper source based on keyword occurrence count.

PARAMETER DESCRIPTION
paper_path

The paper path.

TYPE: Union[Path, str]

RETURNS DESCRIPTION
PaperSource

The analyzed paper source.

TYPE: PaperSource

Source code in labridge\func_modules\paper\parse\extractors\source_analyze.py
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
def keyword_analyze(self, paper_path: Union[Path, str]) -> PaperSource:
	r"""
	Analyze the paper source based on keyword occurrence count.

	Every word (regex `\w+`) on every page is compared case-insensitively with
	`PaperSource.NATURE`.

	Args:
		paper_path (Union[Path, str]): The paper path.

	Returns:
		PaperSource: `PaperSource.NATURE` if the keyword occurs more than
		`keyword_count_threshold` times, otherwise `PaperSource.IEEE`.
	"""
	import pymupdf
	import re

	keyword = PaperSource.NATURE.upper()
	word_pattern = re.compile(r"\w+")

	count = 0
	# The context manager closes the document once the text has been scanned.
	with pymupdf.open(paper_path) as doc:
		for page in doc:
			count += sum(
				1
				for token in word_pattern.findall(page.get_text())
				if token.upper() == keyword
			)

	if count > self.keyword_count_threshold:
		return PaperSource.NATURE
	return PaperSource.IEEE

labridge.func_modules.paper.parse.extractors.source_analyze.PaperSourceAnalyzer.llm_analyze(paper_path)

Source code in labridge\func_modules\paper\parse\extractors\source_analyze.py
64
65
66
def llm_analyze(self, paper_path: Union[Path, str]) -> PaperSource:
	"""
	Placeholder for LLM-based source analysis.

	TODO: using llm. For now `paper_path` is ignored.

	Args:
		paper_path (Union[Path, str]): The paper path.

	Returns:
		PaperSource: Always `PaperSource.DEFAULT`.
	"""
	fallback = PaperSource.DEFAULT
	return fallback

labridge.func_modules.paper.parse.extractors.source_analyze.PaperSourceAnalyzer.reader_analyze(paper_path)

Analyze the paper source using a structured pdf reader.

PARAMETER DESCRIPTION
paper_path

The paper path.

TYPE: Union[Path, str]

RETURNS DESCRIPTION
PaperSource

The paper source.

TYPE: PaperSource

Source code in labridge\func_modules\paper\parse\extractors\source_analyze.py
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
def reader_analyze(self, paper_path: Union[Path, str]) -> PaperSource:
	"""
	Analyze the paper source using the PDF document information dictionary.

	Only the '/Subject' entry of the '/Info' dictionary is inspected.

	Args:
		paper_path (Union[Path, str]): The paper path.

	Returns:
		PaperSource: `PaperSource.NATURE` if the subject contains the Nature keyword
		(case-insensitive); `PaperSource.IEEE` if the subject is at least as long as
		that keyword but does not contain it; `None` if the PDF has no '/Info'
		dictionary, no '/Subject' entry, or a subject shorter than the keyword.
	"""
	import PyPDF2

	keyword = PaperSource.NATURE.upper()

	with open(paper_path, 'rb') as file:
		trailer = PyPDF2.PdfReader(file).trailer
		# PDFs without a document information dictionary are valid;
		# report "unknown" instead of raising KeyError.
		if '/Info' not in trailer:
			return None
		file_info = trailer['/Info']
		if '/Subject' not in file_info:
			return None
		# Read the value while the file is still open: PyPDF2 may have to
		# resolve an indirect reference by reading from the stream.
		subject = file_info['/Subject']

	# A subject too short to hold the keyword carries no usable hint.
	if len(subject) < len(keyword):
		return None
	# Only text subjects can match; any other long-enough subject
	# falls through to IEEE, as before.
	if isinstance(subject, str) and keyword in subject.upper():
		return PaperSource.NATURE
	return PaperSource.IEEE