@@ -47,9 +47,14 @@ public Map<String, Object> getWordTree(List<String> reportIDList, String rootWor
4747 String reportID , reportText ;
4848 List <Map <String , Object >> leftList = new ArrayList <>();
4949 List <Map <String , Object >> rightList = new ArrayList <>();
50-
50+
51+ String searchText = rootWord .replaceAll (" " , "\\ \\ s*" );
52+ // Pattern sentencePattern = Pattern.compile("([^.:]*?" + rootWord + "[^.\n]*\\.)");
53+ System .out .println ("New search is: " + searchText );
54+
55+ Pattern sentencePattern = Pattern .compile (" ([^.:]*?\\ b" + searchText + "\\ b[^\n .?!]*)" , Pattern .CASE_INSENSITIVE );
5156// Pattern sentencePattern = Pattern.compile("([^.:]*?" + rootWord + "[^.\n]*\\.)");
52- Pattern sentencePattern = Pattern . compile ( " ([^.:]*? \\ b" + rootWord + " \\ b[^ \n .?!]*)" , Pattern . CASE_INSENSITIVE );
57+
5358 Pattern tokenPattern = Pattern .compile ("[\\ w']+|[.,!?;]" );
5459
5560
@@ -68,7 +73,7 @@ public Map<String, Object> getWordTree(List<String> reportIDList, String rootWor
6873 reportText = TextUtil .reconstructSentences (reportText );
6974 int oldCount = matchCount ;
7075 matchCount = parseWordTree (reportText , sentencePattern ,
71- tokenPattern , leftList , rightList , reportID , rootWord ,
76+ tokenPattern , leftList , rightList , reportID , searchText ,
7277 matchCount );
7378// docCount++;
7479 // find within the pathology report
@@ -81,7 +86,7 @@ public Map<String, Object> getWordTree(List<String> reportIDList, String rootWor
8186 // use heuristic merging sentences
8287 reportText = TextUtil .reconstructSentences (reportText );
8388 matchCount = parseWordTree (reportText , sentencePattern ,
84- tokenPattern , leftList , rightList , reportID , rootWord ,
89+ tokenPattern , leftList , rightList , reportID , searchText ,
8590 matchCount );
8691// docCount++;
8792 }
@@ -138,11 +143,24 @@ protected int parseWordTree(String reportText, Pattern sentencePattern,
138143 // left branch
139144 tokenList = new ArrayList <>();
140145
146+ System .out .println (matchedSentence );
147+ Pattern pattern = Pattern .compile (rootWord );
148+ Matcher matcher = pattern .matcher (matchedSentence );
149+
150+ Integer start = 0 ;
151+ Integer end = 0 ;
152+
153+ if (matcher .find ()){
154+ start = matcher .start ();
155+ end = matcher .end ();
156+ }
157+
141158 branchMatch = tokenPattern .matcher (matchedSentence .substring (0 ,
142- matchedSentence . indexOf ( rootWord ) ).trim ());
159+ start ).trim ());
143160 while (branchMatch .find ()) {
144161 tokenList .add (branchMatch .group ());
145162 }
163+
146164 matchedItem = new HashMap <>();
147165 matchedItem .put ("doc" , reportID );
148166 matchedItem .put ("id" , Integer .toString (matchCount ));
@@ -152,8 +170,8 @@ protected int parseWordTree(String reportText, Pattern sentencePattern,
152170 // right branch
153171 tokenList = new ArrayList <>();
154172 branchMatch = tokenPattern .matcher (matchedSentence .substring (
155- matchedSentence . indexOf ( rootWord ) + rootWord . length ())
156- . trim ());
173+ end ). trim ());
174+
157175 while (branchMatch .find ()) {
158176 tokenList .add (branchMatch .group ());
159177 }
0 commit comments