Skip to content

Commit 98bc31c

Browse files
committed
Merged from master version 2.0
1 parent 62ac3ed commit 98bc31c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

70 files changed

+35427
-2275
lines changed

.gitignore

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,11 @@ profile_wosserver/
2020
metaknowledgeDocs.md
2121
j9Raws/
2222
/manualj9Abbreviations*
23-
/j9Abbreviations*
24-
notebooks/
25-
vagrant/
23+
metaknowledge/WOS/journalAbbreviations/j9Abbreviations.bak
24+
metaknowledge/WOS/journalAbbreviations/j9Abbreviations.dir
25+
metaknowledge/WOS/journalAbbreviations/j9Abbreviations.dat
26+
!savedrecs.txt
27+
*.bib
2628

2729
# Byte-compiled / optimized / DLL files
2830
__pycache__/

README.md

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1-
#metaknowledge
1+
# metaknowledge
22

3-
metaknowledge is a Python3 library that simplifies bibliometric research using Web of Science data. It reads a directory of plain text files containing meta-data on publications and citations, and writes to a variety of data structures that are suitable for quantitative, network, and text analyses. It handles large datasets (e.g. several million records) efficiently.
3+
_metaknowledge_ is a Python3 library that simplifies bibliometric research using data from various sources. It reads a directory of plain text files containing meta-data on publications and citations, and writes to a variety of data structures that are suitable for quantitative, network, and text analyses. It handles large datasets (e.g. several million records) efficiently.
44

55
The website can be found at [networkslab.org/metaknowledge](http://networkslab.org/metaknowledge/).
66

7-
##Installing
7+
# Major Update
8+
The version of _metaknowledge_ currently available is the second major release and is not backwards compatible with previous versions. If you wish to use an older version, they are still available on GitHub or via pip.
9+
10+
## Installing
811
To install run `python3 setup.py install`
912

1013
For information on alternate installs read the documentation at the [website](http://networkslab.org/metaknowledge/installation/).

metaknowledge/ProQuest/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
"""
2+
"""
3+
from .recordProQuest import *
4+
from .proQuestHandlers import *
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
from ..mkExceptions import BadProQuestFile
2+
3+
from .recordProQuest import ProQuestRecord
4+
5+
def isProQuestFile(infile, checkedLines = 2):
    """Determines if _infile_ starts with the header of a ProQuest report.

    The header is three consecutive lines: a rule of 60 underscores, an empty
    line and the line `'Report Information from ProQuest'`. The file is
    accepted if that header begins on any of its first _checkedLines_ lines.

    # Parameters

    _infile_ : `str`

    > The path of the file to check

    _checkedLines_ : `optional [int]`

    > default 2, the number of leading lines on which the header may start

    # Returns

    `bool`

    > `True` if the header was found, `False` if it was not or if the file
    is not valid UTF-8. Errors from opening the file are not caught.
    """
    header = ["_" * 60 + "\n", "\n", "Report Information from ProQuest\n"]
    try:
        with open(infile, 'r', encoding='utf-8') as openfile:
            # Only the lines a header could occupy are read. readline() gives
            # '' once the file is exhausted, which never equals a header line,
            # so short files simply fail the comparison below.
            firstLines = [openfile.readline() for _ in range(checkedLines + len(header) - 1)]
    except UnicodeDecodeError:
        return False
    # Compare a fixed three line window at every allowed start position, so a
    # partial match on one position cannot shift where the next one is tested.
    return any(firstLines[start:start + len(header)] == header for start in range(checkedLines))
19+
20+
def proQuestParser(proFile):
    """Parses a ProQuest report file into a set of `ProQuestRecord`s.

    The file is assumed to already have been identified as a ProQuest file.
    The first 12 lines are read and discarded, then an index of numbered
    titles is read (each entry is preceded by one discarded line) up to a
    line reading `'Bibliography'`. After that, each line starting with
    `'Document '` begins a record, whose title is checked against the index
    entry with the same number.

    # Parameters

    _proFile_ : `str`

    > The path of the ProQuest file to parse

    # Returns

    `(set[ProQuestRecord], BadProQuestFile or None)`

    > The records parsed before any problem was hit, and the error describing
    that problem, or `None` if the whole file was parsed
    """
    #assumes the file is ProQuest
    nameDict = {}
    recSet = set()
    error = None
    lineNum = 0
    try:
        with open(proFile, 'r', encoding = 'utf-8') as openfile:
            f = enumerate(openfile, start = 1)
            for _ in range(12):
                lineNum, line = next(f)
            # f is file so it *should* end, or at least cause a parser error eventually
            while True:
                # Index entries come in pairs of lines, only the second is used
                lineNum, line = next(f)
                lineNum, line = next(f)
                if line == 'Bibliography\n':
                    for _ in range(3):
                        lineNum, line = next(f)
                    break
                s = line.split('. ')
                # [:-1] drops the trailing newline from the title
                nameDict[int(s[0])] = '. '.join(s[1:])[:-1]
            while True:
                lineNum, line = next(f)
                if line == 'Bibliography\n':
                    break
                elif line.startswith('Document '):
                    n = int(line[9:].split(' of ')[0])
                    # The record consumes lines from f up to its end marker
                    R = ProQuestRecord(f, sFile = proFile, sLine = lineNum)
                    if n not in nameDict:
                        # Without this check the lookup below would raise a
                        # KeyError instead of reporting the problem to the caller
                        error = BadProQuestFile("The numbering of the titles at the beginning of the file does not match the records inside. Line {} has a record titled '{}' with number {}, which is not in the list of titles.".format(lineNum, R.get('Title', "TITLE MISSING"), n))
                        break
                    if R.get('Title') != nameDict[n]:
                        error = BadProQuestFile("The numbering of the titles at the beginning of the file does not match the records inside. Line {} has a record titled '{}' with number {}, the name should be '{}'.".format(lineNum, R.get('Title', "TITLE MISSING"), n, nameDict[n]))
                        break
                    recSet.add(R)
                    lineNum, line = next(f)
                else:
                    #Parsing failed
                    error = BadProQuestFile("The file '{}' has parts of it that are unparsable starting at line: {}. It is likely that the seperators between the records are incorrect".format(proFile, lineNum))
                    break
    except (UnicodeDecodeError, StopIteration, ValueError) as e:
        # StopIteration here means the file ended before the parser expected
        if error is None:
            error = BadProQuestFile("The file '{}' has parts of it that are unparsable starting at line: {}.\nThe error was: '{}'".format(proFile, lineNum, e))
    return recSet, error
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
import collections
2+
import io
3+
import itertools
4+
5+
from ..mkExceptions import BadProQuestRecord, RecordsNotCompatible
6+
from ..mkRecord import ExtendedRecord
7+
8+
from .tagProcessing.specialFunctions import proQuestSpecialTagToFunc
9+
from .tagProcessing.tagFunctions import proQuestTagToFunc
10+
11+
class ProQuestRecord(ExtendedRecord):
    """A single record from a ProQuest report.

    _inRecord_ may be a `dict` of already parsed fields, an `enumerate` or
    `itertools.chain` of `(lineNumber, line)` pairs, an open file, or a `str`
    containing the raw record. Anything else raises a `TypeError`.

    If parsing raises a `BadProQuestRecord`, or the field
    `'ProQuest document ID'` is missing, the record is created flagged as bad
    with the error attached, rather than the exception being propagated.
    """
    def __init__(self, inRecord, recNum = None, sFile = "", sLine = 0):
        bad = False
        error = None
        fieldDict = None
        try:
            # OrderedDict is a dict subclass so one check covers both
            if isinstance(inRecord, dict):
                fieldDict = collections.OrderedDict(inRecord)
            elif isinstance(inRecord, (enumerate, itertools.chain)):
                #Already enumerated
                #itertools.chain is for the parser upstream to insert stuff into the stream
                fieldDict = proQuestRecordParser(inRecord, recNum)
            elif isinstance(inRecord, io.IOBase):
                fieldDict = proQuestRecordParser(enumerate(inRecord), recNum)
            elif isinstance(inRecord, str):
                # The parser compares against lines ending in '\n', so the
                # newline removed by split() is put back on every piece
                lines = (s + '\n' for s in inRecord.split('\n'))
                fieldDict = proQuestRecordParser(enumerate(lines, start = 1), recNum)
            else:
                raise TypeError("Unsupported input type '{}', ProQuestRecords cannot be created from '{}'".format(inRecord, type(inRecord)))
        except BadProQuestRecord as b:
            # Kept in the locals handed to ExtendedRecord.__init__ below, so
            # the parse error is the one that ends up reported
            bad = True
            error = b
            fieldDict = collections.OrderedDict()
        try:
            self._proID = "PROQUEST:{}".format(fieldDict["ProQuest document ID"][0])
        except KeyError:
            self._proID = "PROQUEST:MISSING"
            bad = True
            # Do not hide an earlier parse error behind the missing ID
            if error is None:
                error = BadProQuestRecord("Missing ProQuest document ID")
        ExtendedRecord.__init__(self, fieldDict, self._proID, bad, error, sFile = sFile, sLine = sLine)

    def encoding(self):
        """Returns the encoding used for ProQuest files, always `'utf-8'`."""
        return 'utf-8'

    @staticmethod
    def getAltName(tag):
        """Returns the alternate name of _tag_, always `None` for ProQuest."""
        return None

    @staticmethod
    def tagProcessingFunc(tag):
        """Returns the result of `proQuestTagToFunc(tag)` for _tag_."""
        #Should not raise an exception
        #It might be faster to do this as a class attribute
        return proQuestTagToFunc(tag)

    def specialFuncs(self, key):
        """Looks up _key_ in `proQuestSpecialTagToFunc` and calls the entry
        with this record. A missing _key_ raises a `KeyError`."""
        return proQuestSpecialTagToFunc[key](self)

    def writeRecord(self, infile):
        """Not supported for ProQuest, always raises `RecordsNotCompatible`."""
        raise RecordsNotCompatible("ProQuest's data format cannot be written back to file. You can still write out a csv with writeCSV().")
65+
66+
def proQuestRecordParser(enRecordFile, recNum):
    """Parses one ProQuest record from an iterator of `(lineNumber, line)` pairs.

    Lines are consumed until a rule of 60 underscores is read. The first
    non-empty line is stored under `'Name'`, the first non-empty line after an
    `'Author'` field is stored under `'url'`, lines of the form `'Tag: value'`
    start a new field, and any other non-empty line is appended to the field
    currently being read.

    # Parameters

    _enRecordFile_ : `iterator`

    > Yields `(lineNumber, line)` pairs, with each line ending in a newline

    _recNum_ : `int or None`

    > Not used by the parser

    # Returns

    `collections.OrderedDict[str, list[str]]`

    > The fields in the order they were read, each mapped to its lines

    # Raises

    `StopIteration` if the input ends before the closing rule is read, and
    `KeyError` if a continuation line appears when no field is open.
    """
    tagDict = collections.OrderedDict()
    recordEnd = '_' * 60 + '\n'
    currentEntry = 'Name'
    while True:
        lineNum, line = next(enRecordFile)
        if line == recordEnd:
            break
        elif line == '\n':
            pass
        # Compared with ==, since identity of equal strings is not guaranteed
        elif currentEntry == 'Name' or currentEntry == 'url':
            tagDict[currentEntry] = [line.rstrip()]
            currentEntry = None
        elif ':' in line and not line.startswith('http://'):
            # Only the first ': ' separates tag from value, later ones are
            # part of the value
            currentEntry, _, value = line.partition(': ')
            tagDict[currentEntry] = [value.rstrip()]
            if currentEntry == 'Author':
                # The line following the author field is stored as the url
                currentEntry = 'url'
        else:
            tagDict[currentEntry].append(line.rstrip())
    return tagDict
Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
| [`'CL'`]({{ site.baseurl }}{{ page.url }}#confLocation) | [confLocation]({{ site.baseurl }}{{ page.url }}#confLocation) |
4242
| [`'SP'`]({{ site.baseurl }}{{ page.url }}#confSponsors) | [confSponsors]({{ site.baseurl }}{{ page.url }}#confSponsors) |
4343
| [`'DE'`]({{ site.baseurl }}{{ page.url }}#authKeyWords) | [authKeyWords]({{ site.baseurl }}{{ page.url }}#authKeyWords) |
44-
| [`'ID'`]({{ site.baseurl }}{{ page.url }}#keyWords) | [keyWords]({{ site.baseurl }}{{ page.url }}#keyWords) |
44+
| [`'ID'`]({{ site.baseurl }}{{ page.url }}#keywords) | [keywords]({{ site.baseurl }}{{ page.url }}#keywords) |
4545
| [`'AB'`]({{ site.baseurl }}{{ page.url }}#abstract) | [abstract]({{ site.baseurl }}{{ page.url }}#abstract) |
4646
| [`'C1'`]({{ site.baseurl }}{{ page.url }}#authAddress) | [authAddress]({{ site.baseurl }}{{ page.url }}#authAddress) |
4747
| [`'RP'`]({{ site.baseurl }}{{ page.url }}#reprintAddress) | [reprintAddress]({{ site.baseurl }}{{ page.url }}#reprintAddress) |
@@ -83,5 +83,6 @@
8383
| [`'PM'`]({{ site.baseurl }}{{ page.url }}#pubMedID) | [pubMedID]({{ site.baseurl }}{{ page.url }}#pubMedID) |
8484
"""
8585

86-
from .tagFunctions import *
87-
from .funcDicts import tagToFullDict, fullToTagDict, tagNameConverterDict, tagsAndNameSet, knownTagsList
86+
from .tagProcessing.tagFunctions import *
87+
from .tagProcessing.funcDicts import tagToFullDict, fullToTagDict, tagNameConverterDict, tagsAndNameSet, knownTagsList
88+
from .journalAbbreviations.backend import updatej9DB, getj9dict, abrevDBname, excludeFromDB, addToDB, manaulDBname

metaknowledge/journalAbbreviations/__init__.py renamed to metaknowledge/WOS/journalAbbreviations/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,3 @@
55
66
The other functions of the module are for manually adding and removing abbreviations from the database. It is recommended that this be done with the command-line tool `metaknowledge`, unless you know what you are doing.
77
"""
8-
9-
from .backend import updatej9DB, getj9dict, abrevDBname, excludeFromDB, addToDB, manaulDBname

metaknowledge/journalAbbreviations/backend.py renamed to metaknowledge/WOS/journalAbbreviations/backend.py

Lines changed: 38 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import datetime
66
import dbm.dumb
77

8+
from ...mkExceptions import JournalDataBaseError
9+
810
abrevDBname = "j9Abbreviations"
911

1012
manaulDBname = "manualj9Abbreviations"
@@ -108,19 +110,22 @@ def updatej9DB(dbname = abrevDBname, saveRawHTML = False):
108110
if not os.path.isdir(rawDir):
109111
os.mkdir(rawDir)
110112
_j9SaveCurrent(sDir = rawDir)
111-
dbLoc = os.path.normpath(os.path.dirname(__file__) + '/{}'.format(dbname))
112-
with dbm.dumb.open(dbLoc, flag = 'c') as db:
113-
try:
114-
j9Dict = _getCurrentj9Dict()
115-
except urllib.error.URLError:
116-
raise urllib.error.URLError("Unable to access server, check your connection")
117-
for k, v in j9Dict.items():
118-
if k in db:
119-
for jName in v:
120-
if jName not in j9Dict[k]:
121-
j9Dict[k] += '|' + jName
122-
else:
123-
db[k] = '|'.join(v)
113+
dbLoc = os.path.join(os.path.normpath(os.path.dirname(__file__)), dbname)
114+
try:
115+
with dbm.dumb.open(dbLoc, flag = 'c') as db:
116+
try:
117+
j9Dict = _getCurrentj9Dict()
118+
except urllib.error.URLError:
119+
raise urllib.error.URLError("Unable to access server, check your connection")
120+
for k, v in j9Dict.items():
121+
if k in db:
122+
for jName in v:
123+
if jName not in j9Dict[k]:
124+
j9Dict[k] += '|' + jName
125+
else:
126+
db[k] = '|'.join(v)
127+
except dbm.dumb.error as e:
128+
raise mkException("Something happened with the database of WOS journal names. To fix this you should delete the 1 to 3 files whose names start with {}. If this doesn't work (sorry), deleteing everything in '{}' and reinstalling metaknowledge should.\nThe error was '{}'".format(dbLoc, os.path.dirname(__file__), e))
124129

125130
def getj9dict(dbname = abrevDBname, manualDB = manaulDBname, returnDict = 'both'):
126131
"""Returns the dictionary of journal abbreviations mapping to a list of the associated journal names. By default the local database is used. The database is in the file _dbname_ in the same directory as this source file
@@ -142,21 +147,28 @@ def getj9dict(dbname = abrevDBname, manualDB = manaulDBname, returnDict = 'both'
142147
dbLoc = os.path.normpath(os.path.dirname(__file__))
143148

144149
retDict = {}
145-
146-
if returnDict == 'both' or returnDict == 'WOS':
147-
with dbm.dumb.open(dbLoc + '/{}'.format(dbname)) as db:
148-
if len(db) == 0:
149-
raise RuntimeError("J9 Database empty or missing, to regenerate it run metaknowledge.journalAbbreviations.updatej9DB().")
150-
for k, v in db.items():
151-
retDict[k.decode('utf-8')] = v.decode('utf-8').split('|')
152-
if returnDict == 'both' or returnDict == 'manual':
153-
if os.path.isfile(dbLoc + '/{}.dat'.format(manualDB)):
154-
with dbm.dumb.open(dbLoc + '/{}'.format(manualDB)) as db:
150+
try:
151+
if returnDict == 'both' or returnDict == 'WOS':
152+
with dbm.dumb.open(dbLoc + '/{}'.format(dbname)) as db:
153+
if len(db) == 0:
154+
raise JournalDataBaseError("J9 Database empty or missing, to regenerate it import and run metaknowledge.WOS.journalAbbreviations.updatej9DB().")
155155
for k, v in db.items():
156156
retDict[k.decode('utf-8')] = v.decode('utf-8').split('|')
157-
else:
158-
if returnDict == 'manual':
159-
raise RuntimeError("Manual J9 Database ({0}) missing, to create it run addToDB(dbname = {0})".format(manualDB))
157+
except JournalDataBaseError:
158+
updatej9DB()
159+
return getj9dict(dbname = dbname, manualDB = manualDB, returnDict = returnDict)
160+
try:
161+
if returnDict == 'both' or returnDict == 'manual':
162+
if os.path.isfile(dbLoc + '/{}.dat'.format(manualDB)):
163+
with dbm.dumb.open(dbLoc + '/{}'.format(manualDB)) as db:
164+
for k, v in db.items():
165+
retDict[k.decode('utf-8')] = v.decode('utf-8').split('|')
166+
else:
167+
if returnDict == 'manual':
168+
raise JournalDataBaseError("Manual J9 Database ({0}) missing, to create it run addToDB(dbname = {0})".format(manualDB))
169+
except JournalDataBaseError:
170+
updatej9DB(dbname = manualDB)
171+
return getj9dict(dbname = dbname, manualDB = manualDB, returnDict = returnDict)
160172
return retDict
161173

162174
def addToDB(abbr = None, dbname = manaulDBname):

0 commit comments

Comments
 (0)