1- from glob import glob
# This script operates directly on the "annotation" folder produced by exporting a WebAnno project.
# For SOCC, this folder is SOCC\annotated\Appraisal\curation.
# Each of \annotation's sub-folders contains a TSV that holds the annotations for the given comment.
# This script puts all of those TSVs into one long file, appending one after the other. In that file,
# comment lines starting with '#' indicate where each source TSV begins and ends.
6+
7+ import os
28from smart_open import smart_open
39import re
410
# Both locations are typed in by the user when the script starts.

# Folder that contains only the WebAnno TSV sub-folders.
_project_prompt = 'Path to project folder: (e.g. C:\\ ...\\ curation)'
projectpath = input(_project_prompt)

# File that the combined TSV is written to.
_output_prompt = "Path to write new TSV to: (e.g. 'C:\\ ...\\ newfile.tsv')"
outputpath = input(_output_prompt)
1715
# Get the sub-folders of the project folder (one per annotated comment).
folders = os.listdir(projectpath)

# Every TSV is expected to be called CURATION_USER.tsv, so the folder name is
# the only thing that tells us which comment a file belongs to. Pair each
# folder name with the listing of its contents: [folder, [entries...]].
files = [[f, os.listdir(os.path.join(projectpath, f))] for f in folders]

# Each folder must contain exactly one file. Build the offender lists first
# and test them for emptiness: comparing the result of any() with 1 is always
# False (any() returns a bool), so that form can never report a problem.
empty_folders = [f[0] for f in files if not f[1]]
if empty_folders:
    # Without this check an empty folder fails below with a bare IndexError.
    raise Exception('Some folders have no files:', empty_folders)

bad_folders = [f[0] for f in files if len(f[1]) > 1]
if bad_folders:
    raise Exception('Some folders have more than one file:', bad_folders)

# Exactly one entry per folder, so flatten to [folder, filename].
files = [[f[0], f[1][0]] for f in files]

# That single file has to be CURATION_USER.tsv.
bad_names = [f[1] for f in files if f[1] != 'CURATION_USER.tsv']
if bad_names:
    raise Exception('Expected files named CURATION_USER.tsv; unexpected file names found:', bad_names)

# Reaching this point means every folder passed both checks, so report once.
print('Found curated annotations')
# start combining the files
verbose = False  # setting this to True may help troubleshooting

# Collect the pieces in a list and join once at the end; repeated string
# concatenation re-copies the growing text for every file that is added.
parts = []
for folder, filename in files:
    # The folder name identifies the comment; the file name is the same for all.
    name = folder
    f_path = os.path.join(projectpath, folder, filename)
    # Marker lines must start with '#' at the very beginning of a line, so the
    # newline is not followed by any padding.
    parts.append('#comment: ' + name + '\n')
    with smart_open(f_path, 'r', encoding='utf-8') as f_io:
        text = f_io.read()
    parts.append(text)
    if text and not text.endswith('\n'):
        # Keep the end marker on its own line even when the source TSV has no
        # trailing newline.
        parts.append('\n')
    parts.append('#end of comment\n\n')
    if verbose:
        print('processed', name)
newfile = ''.join(parts)

# output
print('All files processed, writing to', outputpath)
# Write with the same encoding the sources were read with, instead of relying
# on the platform default.
with smart_open(outputpath, 'w', encoding='utf-8') as output:
    output.write(newfile)
print('Finished writing.')
0 commit comments