Skip to content

Commit 5501301

Browse files
authored
Add files via upload
Updated for better commenting and some ability to check that it's being used correctly
1 parent 62922cf commit 5501301

File tree

1 file changed

+45
-30
lines changed

1 file changed

+45
-30
lines changed

scripts/combine_webanno.py

Lines changed: 45 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,56 @@
1-
from glob import glob
1+
# This script operates directly on the "annotation" folder output by exporting a WebAnno project
2+
# for SOCC, this folder is SOCC\annotated\Appraisal\curation
3+
# Each of \annotation's sub-folders contains a TSV that contains the annotations for the given comment.
4+
# This script puts all of those TSVs into one long file, appending one after the other. In that file,
5+
# commented lines using '#' indicate when the source TSVs begin and end.
6+
7+
import os
28
from smart_open import smart_open
39
import re
410

5-
# directory with TSVs
6-
projectpath = input('Path to project folder: (e.g. C:/.../curation)')
11+
# path to the exported folder whose sub-folders each contain one WebAnno TSV
12+
projectpath = input('Path to project folder: (e.g. C:\\...\\curation)')
713
# path of the combined TSV file to write
8-
outputpath = input("Path to folder to write new TSV to: (e.g. 'C:/.../newfile.tsv')")
9-
10-
11-
def getcontents(directory):
12-
"""
13-
Return the paths matched by glob(directory + '/*'), i.e. the entries directly inside the given directory,
14-
with every '\\' in each path converted to '/'.
15-
"""
16-
return [name.replace('\\', '/') for name in glob(directory + '/*')]
14+
outputpath = input("Path to write new TSV to: (e.g. 'C:\\...\\newfile.tsv')")
1715

16+
# Get the sub-folders of the exported folder (one sub-folder per annotated comment).
folders = os.listdir(projectpath)
# Every TSV is expected to be named CURATION_USER.tsv, so the folder name is the only thing that
# identifies which comment a TSV belongs to. Pair each folder name with the list of entries inside it.
files = [[folder, os.listdir(os.path.join(projectpath, folder))] for folder in folders]

# Check that each folder contains exactly one entry.
# An empty folder would otherwise fail below with a bare IndexError, so report it explicitly.
empty_folders = [folder for folder, contents in files if not contents]
if empty_folders:
    raise Exception('Some folders are empty:', empty_folders)
# The per-folder count must be compared inside the test; comparing the result of any() with 1
# is never true, which would let folders holding several files slip through unnoticed.
bad_folders = [folder for folder, contents in files if len(contents) > 1]
if bad_folders:
    raise Exception('Some folders have more than one file:', bad_folders)

# Every folder now holds exactly one entry, so replace the one-element list with the file name itself.
# From here on, for each item f in files, f[0] is the folder name and f[1] is the file name.
files = [[folder, contents[0]] for folder, contents in files]

# Check that the single file in each folder is CURATION_USER.tsv.
bad_names = [filename for _, filename in files if filename != 'CURATION_USER.tsv']
if bad_names:
    raise Exception('Expected files named CURATION_USER.tsv; unexpected file names found:', bad_names)
print('Found curated annotations')
1838

19-
folders = getcontents(projectpath)
20-
files = [getcontents(doc)[0] for doc in folders]
21-
39+
# start combining the files
40+
verbose = False # setting this to True may help troubleshooting
2241
newfile = ''
23-
names = []
24-
for file in files:
25-
not_name = re.search(r'.*/', file).group()
26-
endlength = len(file[len(not_name):]) + 1
27-
beginlength = len(re.search(r'.*/', file[:-endlength]).group())
28-
name = file[beginlength:-endlength]
29-
names.append(name)
42+
for f in files:
43+
name = f[0]
44+
f_path = os.path.join(projectpath, f[0], f[1])
45+
# indicate the beginning and end of a comment, and what that comment's name is
3046
newfile = newfile + '#comment: ' + name + '\n'
31-
with smart_open(file, 'r') as f:
32-
newfile = newfile + f.read() + '#end of comment\n\n'
33-
print('processed', name)
47+
with smart_open(f_path, 'r', encoding='utf-8') as f_io:
48+
newfile = newfile + f_io.read() + '#end of comment\n\n'
49+
if verbose:
50+
print('processed', name)
3451

3552
# output
36-
if outputpath:
37-
with smart_open(outputpath, 'w') as output:
53+
print('All files processed, writing to', outputpath)
54+
# Write with the same encoding the source TSVs were read with, so the result does not depend on
# the platform's default encoding (which may be unable to represent characters read as UTF-8).
with smart_open(outputpath, 'w', encoding='utf-8') as output:
    output.write(newfile)
39-
print('Combined everything!')
40-
else:
41-
print("Didn't write anything. No output path was given.")
56+
print('Finished writing.')

0 commit comments

Comments
 (0)