Skip to content

Commit b894576

Browse files
committed
Fix header separation in Pathology
1 parent a17a617 commit b894576

File tree

2 files changed

+48
-45
lines changed

2 files changed

+48
-45
lines changed

src/edu/pitt/cs/nih/backend/featureVector/Preprocess.java

Lines changed: 46 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -267,40 +267,42 @@ public static String[] separatePathologyHeaderFooter(String input) throws Except
267267
sb.append("\n");
268268
iLine++;
269269
}
270-
// make header
271-
text[0] = TextUtil.removeDuplicatedSpace(sb.toString().trim());
270+
271+
// make header
272+
text[0] = TextUtil.removeDuplicatedSpace(sb.toString().trim());
272273

273-
// content starts from here until meet Pathologist (beginning of a sentence)
274-
sb = new StringBuilder();
275-
if(iLine < allLines - 1) {
276-
iLine++;
277-
// text += lines[iLine].trim() + "\n";
278-
sb.append(lines[iLine].trim());
279-
sb.append("\n");
280-
}
281-
else {
282-
iLine = -1;
283-
}
284-
iLine++;
274+
// // content starts from here until meet Pathologist (beginning of a sentence)
275+
// sb = new StringBuilder();
276+
// if(iLine < allLines - 1) {
277+
// iLine++;
278+
// // text += lines[iLine].trim() + "\n";
279+
// sb.append(lines[iLine].trim());
280+
// sb.append("\n");
281+
// }
282+
// else {
283+
// iLine = -1;
284+
// }
285+
// iLine++;
285286

286-
Pattern p = Pattern.compile("^\\s*Pathologist");
287-
Matcher m = p.matcher(lines[iLine]);
288-
while(iLine < allLines && !m.find()) {
289-
if(!lines[iLine].trim().equals("")) {
290-
// text += lines[iLine].trim() + "\n";
291-
sb.append(lines[iLine].trim());
292-
sb.append("\n");
293-
}
294-
iLine++;
295-
m = p.matcher(lines[iLine]);
296-
}
297-
// skip until meet GROSS DESCRIPTION
298-
p = Pattern.compile("^GROSS DESCRIPTION");
299-
m = p.matcher(lines[iLine]);
300-
while(iLine < allLines && !m.find()) {
301-
iLine++;
302-
m = p.matcher(lines[iLine]);
303-
}
287+
// Pattern p = Pattern.compile("^\\s*Pathologist");
288+
// Matcher m = p.matcher(lines[iLine]);
289+
// while(iLine < allLines && !m.find()) {
290+
// if(!lines[iLine].trim().equals("")) {
291+
// // text += lines[iLine].trim() + "\n";
292+
// sb.append(lines[iLine].trim());
293+
// sb.append("\n");
294+
// }
295+
// iLine++;
296+
// m = p.matcher(lines[iLine]);
297+
// }
298+
// // skip until meet GROSS DESCRIPTION
299+
// p = Pattern.compile("^GROSS DESCRIPTION");
300+
// m = p.matcher(lines[iLine]);
301+
// while(iLine < allLines && !m.find()) {
302+
// iLine++;
303+
// m = p.matcher(lines[iLine]);
304+
// }
305+
304306
// keep contain until meet E_O_R
305307
while(iLine < allLines && lines[iLine].indexOf("E_O_R") == -1) {
306308
if(!lines[iLine].trim().equals("")) {
@@ -310,22 +312,23 @@ public static String[] separatePathologyHeaderFooter(String input) throws Except
310312
}
311313
iLine++;
312314
}
313-
// make content
315+
316+
// make content
314317
// remove stop word
315-
// and **ID-NUM
316-
text[1] = TextUtil.removeDuplicatedSpace(sb.toString().trim());
318+
// and **ID-NUM
319+
text[1] = TextUtil.removeDuplicatedSpace(sb.toString().trim());
317320
text[1] = text[1].replaceAll("\\*\\*ID\\-NUM", "");
318321
text[1] = text[1].replaceAll("\\*\\*INITIALS", "");
319322
text[1] = text[1].replaceAll("_{3,}", "");
320323

321-
// footer starts from here to the end
322-
sb = new StringBuilder();
323-
while(iLine < allLines) {
324-
sb.append(lines[iLine++].trim());
325-
sb.append("\n");
326-
}
327-
// make the footer
328-
text[2] = TextUtil.removeDuplicatedSpace(sb.toString().trim());
324+
// footer starts from here to the end
325+
sb = new StringBuilder();
326+
while(iLine < allLines) {
327+
sb.append(lines[iLine++].trim());
328+
sb.append("\n");
329+
}
330+
// make the footer
331+
text[2] = TextUtil.removeDuplicatedSpace(sb.toString().trim());
329332

330333
return text;
331334
}

src/edu/pitt/cs/nih/backend/feedback/TextFileFeedbackManager_LibSVM_WordTree.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -981,13 +981,13 @@ public String wordTreeSkippedNGramPatternString(Map<String, String> spanMap)
981981
patternStr = sb.toString().trim().replaceAll(whiteSpaceBeforePunc, "\\\\s{0,1}");
982982
// in case the first skipped n-gram is a punctuation
983983
// there would be no white space before the n-gram
984-
patternStr = patternStr.replaceAll(" (?=(\\(\\\\S\\+))", "\\\\s*");
984+
patternStr = patternStr.replaceAll(" (?=(\\(\\\\S\\+))", "\\\\W*");
985985
// // quote the string
986986
// patternStr = TextUtil.escapeRegex(patternStr);
987987
// reverse 's
988988
patternStr = patternStr.replaceAll("'s", "' {0,1}s");
989989
// replace whitespace by \s
990-
patternStr = patternStr.replaceAll("\\s(?!\\{)", "\\\\s+");
990+
patternStr = patternStr.replaceAll("\\s(?!\\{)", "\\\\W+");
991991

992992
// System.out.println("Search pattern: " + patternStr);
993993

0 commit comments

Comments
 (0)