from abc import ABC, abstractmethod
import re
import string
from collections import Counter

from nltk.tokenize import sent_tokenize
from omniparse.web.model_loader import load_nltk_punkt


# Define the abstract base class for chunking strategies
class ChunkingStrategy(ABC):
    @abstractmethod
    def chunk(self, text: str) -> list:
        """
        Abstract method to chunk the given text.
        """
        pass


# Regex-based chunking
class RegexChunking(ChunkingStrategy):
    def __init__(self, patterns=None, **kwargs):
        if patterns is None:
            patterns = [r"\n\n"]  # Default split pattern
        self.patterns = patterns

    def chunk(self, text: str) -> list:
        # Repeatedly split the running list of fragments on each pattern
        paragraphs = [text]
        for pattern in self.patterns:
            new_paragraphs = []
            for paragraph in paragraphs:
                new_paragraphs.extend(re.split(pattern, paragraph))
            paragraphs = new_paragraphs
        return paragraphs
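
# Illustrative usage sketch (not part of the original module); assumes the
# default blank-line pattern:
#   chunker = RegexChunking()
#   chunker.chunk("First paragraph.\n\nSecond paragraph.")
#   # -> ["First paragraph.", "Second paragraph."]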


# NLP-based sentence chunking
class NlpSentenceChunking(ChunkingStrategy):
    def __init__(self, **kwargs):
        # Make sure the NLTK punkt sentence model is available
        load_nltk_punkt()

    def chunk(self, text: str) -> list:
        sentences = sent_tokenize(text)
        sens = [sent.strip() for sent in sentences]
        # Deduplicate; note that set() does not preserve sentence order
        return list(set(sens))
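
# Illustrative usage sketch (not part of the original module); because of the
# set()-based deduplication above, sentence order is not guaranteed:
#   chunker = NlpSentenceChunking()
#   chunker.chunk("Hello world. How are you?")
#   # -> ["Hello world.", "How are you?"] (in arbitrary order)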


# Topic-based segmentation using TextTiling
class TopicSegmentationChunking(ChunkingStrategy):
    def __init__(self, num_keywords=3, **kwargs):
        import nltk as nl

        self.tokenizer = nl.tokenize.TextTilingTokenizer()
        self.num_keywords = num_keywords

    def chunk(self, text: str) -> list:
        # Segment the text into topical sections using the TextTiling tokenizer
        segmented_topics = self.tokenizer.tokenize(text)
        return segmented_topics

    def extract_keywords(self, text: str) -> list:
        # Tokenize and remove stopwords and punctuation
        import nltk as nl

        tokens = nl.tokenize.word_tokenize(text)
        tokens = [
            token.lower()
            for token in tokens
            if token not in nl.corpus.stopwords.words("english")
            and token not in string.punctuation
        ]

        # Calculate frequency distribution
        freq_dist = Counter(tokens)
        # Keep the most frequent tokens as keywords
        keywords = [word for word, freq in freq_dist.most_common(self.num_keywords)]
        return keywords

    def chunk_with_topics(self, text: str) -> list:
        # Segment the text into topics
        segments = self.chunk(text)
        # Extract keywords for each topic segment
        segments_with_topics = [
            (segment, self.extract_keywords(segment)) for segment in segments
        ]
        return segments_with_topics
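
# Illustrative usage sketch (not part of the original module). TextTiling
# relies on NLTK's stopwords corpus and expects multi-paragraph input;
# long_article_text below is a hypothetical placeholder:
#   chunker = TopicSegmentationChunking(num_keywords=3)
#   chunker.chunk_with_topics(long_article_text)
#   # -> [(segment_text, ["kw1", "kw2", "kw3"]), ...]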


# Fixed-length word chunks
class FixedLengthWordChunking(ChunkingStrategy):
    def __init__(self, chunk_size=100, **kwargs):
        self.chunk_size = chunk_size

    def chunk(self, text: str) -> list:
        words = text.split()
        return [
            " ".join(words[i : i + self.chunk_size])
            for i in range(0, len(words), self.chunk_size)
        ]
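
# Illustrative usage sketch (not part of the original module):
#   chunker = FixedLengthWordChunking(chunk_size=2)
#   chunker.chunk("one two three four five")
#   # -> ["one two", "three four", "five"]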


# Sliding window chunking
class SlidingWindowChunking(ChunkingStrategy):
    def __init__(self, window_size=100, step=50, **kwargs):
        self.window_size = window_size
        self.step = step

    def chunk(self, text: str) -> list:
        words = text.split()
        chunks = []
        for i in range(0, len(words), self.step):
            chunks.append(" ".join(words[i : i + self.window_size]))
        return chunks
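
# Illustrative usage sketch (not part of the original module); consecutive
# windows overlap by window_size - step words:
#   chunker = SlidingWindowChunking(window_size=4, step=2)
#   chunker.chunk("a b c d e f")
#   # -> ["a b c d", "c d e f", "e f"]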