UWNETLAB
diff --git a/.gitignore b/.gitignore
Lines changed: 5 additions & 3 deletions
diff --git a/README.md b/README.md
Lines changed: 6 additions & 3 deletions
diff --git a/metaknowledge/ProQuest/__init__.py b/metaknowledge/ProQuest/__init__.py
Lines changed: 4 additions & 0 deletions
diff --git a/metaknowledge/ProQuest/proQuestHandlers.py b/metaknowledge/ProQuest/proQuestHandlers.py
Lines changed: 62 additions & 0 deletions
diff --git a/metaknowledge/ProQuest/recordProQuest.py b/metaknowledge/ProQuest/recordProQuest.py
Lines changed: 86 additions & 0 deletions
diff --git a/metaknowledge/tagProcessing/__init__.py b/metaknowledge/WOS/__init__.py
metaknowledge/tagProcessing/__init__.py renamed to metaknowledge/WOS/__init__.py
Lines changed: 4 additions & 3 deletions
diff --git a/metaknowledge/journalAbbreviations/__init__.py b/metaknowledge/WOS/journalAbbreviations/__init__.py
metaknowledge/journalAbbreviations/__init__.py renamed to metaknowledge/WOS/journalAbbreviations/__init__.py
Lines changed: 0 additions & 2 deletions
diff --git a/metaknowledge/journalAbbreviations/backend.py b/metaknowledge/WOS/journalAbbreviations/backend.py
metaknowledge/journalAbbreviations/backend.py renamed to metaknowledge/WOS/journalAbbreviations/backend.py
Lines changed: 38 additions & 26 deletions
@@ -20,9 +20,11 @@ profile_wosserver/
 metaknowledgeDocs.md
 j9Raws/
 /manualj9Abbreviations*
-/j9Abbreviations*
-notebooks/
-vagrant/
+metaknowledge/WOS/journalAbbreviations/j9Abbreviations.bak
+metaknowledge/WOS/journalAbbreviations/j9Abbreviations.dir
+metaknowledge/WOS/journalAbbreviations/j9Abbreviations.dat
+!savedrecs.txt
+*.bib
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
 
@@ -1,10 +1,13 @@
-#metaknowledge
+# metaknowledge
 
-metaknowledge is a Python3 library that simplifies bibliometric research using Web of Science data. It reads a directory of plain text files containing meta-data on publications and citations, and writes to a variety of data structures that are suitable for quantitative, network, and text analyses. It handles large datasets (e.g. several million records) efficiently.
+_metaknowledge_ is a Python3 library that simplifies bibliometric research using data from various sources. It reads a directory of plain text files containing meta-data on publications and citations, and writes to a variety of data structures that are suitable for quantitative, network, and text analyses. It handles large datasets (e.g. several million records) efficiently.
 
 The website can be found at [networkslab.org/metaknowledge](http://networkslab.org/metaknowledge/).
 
-##Installing
+# Major Update
+The version of _metaknowledge_ currently available is the second major release and is not backwards compatible with previous versions. Older versions are still available on GitHub or through pip if you wish to use them.
+
+## Installing
 To install run `python3 setup.py install`
 
 For information on alternate installs read the documentation at the [website](http://networkslab.org/metaknowledge/installation/).
@@ -0,0 +1,4 @@
+"""
+"""
+from .recordProQuest import *
+from .proQuestHandlers import *
@@ -0,0 +1,62 @@
+from ..mkExceptions import BadProQuestFile
+
+from .recordProQuest import ProQuestRecord
+
+def isProQuestFile(infile, checkedLines = 2):
+    try:
+        with open(infile, 'r', encoding='utf-8') as openfile:
+            f = enumerate(openfile, start = 0)
+            for i in range(checkedLines):
+                #This seems like enough checking
+                #The next line is the date meaning it is not constant
+                #More checking could be done
+                if f.__next__()[1] == "_" * 60 + "\n" and f.__next__()[1] == '\n' and f.__next__()[1] == 'Report Information from ProQuest\n':
+                    return True
+    except (StopIteration, UnicodeDecodeError):
+        return False
+    else:
+        return False
+
+def proQuestParser(proFile):
+    #assumes the file is ProQuest
+    nameDict = {}
+    recSet = set()
+    error = None
+    lineNum = 0
+    try:
+        with open(proFile, 'r', encoding = 'utf-8') as openfile:
+            f = enumerate(openfile, start = 1)
+            for i in range(12):
+                lineNum, line = next(f)
+            # f is file so it *should* end, or at least cause a parser error eventually
+            while True:
+                lineNum, line = next(f)
+                lineNum, line = next(f)
+                if line == 'Bibliography\n':
+                    for i in range(3):
+                        lineNum, line = next(f)
+                    break
+                else:
+                    s = line.split('. ')
+                    nameDict[int(s[0])] = '. '.join(s[1:])[:-1]
+            while True:
+                #import pdb; pdb.set_trace()
+                lineNum, line = next(f)
+                if line == 'Bibliography\n':
+                    break
+                elif line.startswith('Document '):
+                    n = int(line[9:].split(' of ')[0])
+                    R = ProQuestRecord(f, sFile = proFile, sLine = lineNum)
+                    if R.get('Title') != nameDict[n]:
+                        error = BadProQuestFile("The numbering of the titles at the beginning of the file does not match the records inside. Line {} has a record titled '{}' with number {}, the name should be '{}'.".format(lineNum, R.get('Title', "TITLE MISSING"), n, nameDict[n]))
+                        raise StopIteration
+                    recSet.add(R)
+                    lineNum, line = next(f)
+                else:
+                    #Parsing failed
+                    error = BadProQuestFile("The file '{}' has parts of it that are unparsable starting at line: {}. It is likely that the seperators between the records are incorrect".format(proFile, lineNum))
+                    raise StopIteration
+    except (UnicodeDecodeError, StopIteration, ValueError) as e:
+        if error is None:
+            error = BadProQuestFile("The file '{}' has parts of it that are unparsable starting at line: {}.\nThe error was: '{}'".format(proFile, lineNum, e))
+    return recSet, error
@@ -0,0 +1,86 @@
+import collections
+import io
+import itertools
+
+from ..mkExceptions import BadProQuestRecord, RecordsNotCompatible
+from ..mkRecord import ExtendedRecord
+
+from .tagProcessing.specialFunctions import proQuestSpecialTagToFunc
+from .tagProcessing.tagFunctions import proQuestTagToFunc
+
+class ProQuestRecord(ExtendedRecord):
+    def __init__(self, inRecord, recNum = None, sFile = "", sLine = 0):
+        bad = False
+        error = None
+        fieldDict = None
+        try:
+            if isinstance(inRecord, dict) or isinstance(inRecord, collections.OrderedDict):
+                fieldDict = collections.OrderedDict(inRecord)
+            elif isinstance(inRecord, enumerate) or isinstance(inRecord, itertools.chain):
+                #Already enumerated
+                #itertools.chain is for the parser upstream to insert stuff into the stream
+                fieldDict = proQuestRecordParser(inRecord, recNum)
+            elif isinstance(inRecord, io.IOBase):
+                fieldDict = proQuestRecordParser(enumerate(inRecord), recNum)
+            elif isinstance(inRecord, str):
+                #Probably a better way to do this but it isn't going to be used much, so no need to improve it
+                def addCharToEnd(lst):
+                    for s in lst:
+                        yield s + '\n'
+                fieldDict = proQuestRecordParser(enumerate(addCharToEnd(inRecord.split('\n')), start = 1), recNum)
+                #string io
+            else:
+                raise TypeError("Unsupported input type '{}', ProQuestRecords cannot be created from '{}'".format(inRecord, type(inRecord)))
+        except BadProQuestRecord as b:
+            self.bad = True
+            self.error = b
+            fieldDict = collections.OrderedDict()
+        try:
+            self._proID = "PROQUEST:{}".format(fieldDict["ProQuest document ID"][0])
+        except KeyError:
+            self._proID = "PROQUEST:MISSING"
+            bad = True
+            error = BadProQuestRecord("Missing ProQuest document ID")
+        ExtendedRecord.__init__(self, fieldDict, self._proID, bad, error, sFile =sFile, sLine = sLine)
+
+    def encoding(self):
+        return 'utf-8'
+
+    @staticmethod
+    def getAltName(tag):
+        return None
+
+    @staticmethod
+    def tagProcessingFunc(tag):
+        #Should not raise an exception
+        #It might be faster to do this as a class attribute
+        return proQuestTagToFunc(tag)
+
+    def specialFuncs(self, key):
+        return proQuestSpecialTagToFunc[key](self)
+        #raise KeyError("There are no special functions given by default.")
+
+    def writeRecord(self, infile):
+        raise RecordsNotCompatible("ProQuest's data format cannot be written back to file. You can still write out a csv with writeCSV().")
+
+def proQuestRecordParser(enRecordFile, recNum):
+    tagDict = collections.OrderedDict()
+    currentEntry = 'Name'
+    while True:
+        lineNum, line = next(enRecordFile)
+        if line == '_' * 60 + '\n':
+            break
+        elif line == '\n':
+            pass
+        elif currentEntry is 'Name' or currentEntry is 'url':
+            tagDict[currentEntry] = [line.rstrip()]
+            currentEntry = None
+        elif ':' in line and not line.startswith('http://'):
+            splitLine = line.split(': ')
+            currentEntry = splitLine[0]
+            tagDict[currentEntry] = [': '.join(splitLine[1:]).rstrip()]
+            if currentEntry == 'Author':
+                currentEntry = 'url'
+        else:
+            tagDict[currentEntry].append(line.rstrip())
+    return tagDict
@@ -41,7 +41,7 @@
 | [`'CL'`]({{ site.baseurl }}{{ page.url }}#confLocation) | [confLocation]({{ site.baseurl }}{{ page.url }}#confLocation) |
 | [`'SP'`]({{ site.baseurl }}{{ page.url }}#confSponsors) | [confSponsors]({{ site.baseurl }}{{ page.url }}#confSponsors) |
 | [`'DE'`]({{ site.baseurl }}{{ page.url }}#authKeyWords) | [authKeyWords]({{ site.baseurl }}{{ page.url }}#authKeyWords) |
-| [`'ID'`]({{ site.baseurl }}{{ page.url }}#keyWords) | [keyWords]({{ site.baseurl }}{{ page.url }}#keyWords) |
+| [`'ID'`]({{ site.baseurl }}{{ page.url }}#keywords) | [keywords]({{ site.baseurl }}{{ page.url }}#keywords) |
 | [`'AB'`]({{ site.baseurl }}{{ page.url }}#abstract) | [abstract]({{ site.baseurl }}{{ page.url }}#abstract) |
 | [`'C1'`]({{ site.baseurl }}{{ page.url }}#authAddress) | [authAddress]({{ site.baseurl }}{{ page.url }}#authAddress) |
 | [`'RP'`]({{ site.baseurl }}{{ page.url }}#reprintAddress) | [reprintAddress]({{ site.baseurl }}{{ page.url }}#reprintAddress) |
@@ -83,5 +83,6 @@
 | [`'PM'`]({{ site.baseurl }}{{ page.url }}#pubMedID) | [pubMedID]({{ site.baseurl }}{{ page.url }}#pubMedID) |
 """
 
-from .tagFunctions import *
-from .funcDicts import tagToFullDict, fullToTagDict, tagNameConverterDict, tagsAndNameSet, knownTagsList
+from .tagProcessing.tagFunctions import *
+from .tagProcessing.funcDicts import tagToFullDict, fullToTagDict, tagNameConverterDict, tagsAndNameSet, knownTagsList
+from .journalAbbreviations.backend import updatej9DB, getj9dict, abrevDBname, excludeFromDB, addToDB, manaulDBname
@@ -5,5 +5,3 @@
 
 The other functions of the module are for manually adding and removing abbreviations from the database. It is recommended that this be done with the command-line tool `metaknowledge`, unless you know what you are doing.
 """
-
-from .backend import updatej9DB, getj9dict, abrevDBname, excludeFromDB, addToDB, manaulDBname
@@ -5,6 +5,8 @@
 import datetime
 import dbm.dumb
 
+from ...mkExceptions import JournalDataBaseError
+
 abrevDBname = "j9Abbreviations"
 
 manaulDBname = "manualj9Abbreviations"
@@ -108,19 +110,22 @@ def updatej9DB(dbname = abrevDBname, saveRawHTML = False):
         if not os.path.isdir(rawDir):
             os.mkdir(rawDir)
         _j9SaveCurrent(sDir = rawDir)
-    dbLoc = os.path.normpath(os.path.dirname(__file__) + '/{}'.format(dbname))
-    with dbm.dumb.open(dbLoc, flag = 'c') as db:
-        try:
-            j9Dict = _getCurrentj9Dict()
-        except urllib.error.URLError:
-            raise urllib.error.URLError("Unable to access server, check your connection")
-        for k, v in j9Dict.items():
-            if k in db:
-                for jName in v:
-                    if jName not in j9Dict[k]:
-                        j9Dict[k] += '|' + jName
-            else:
-                db[k] = '|'.join(v)
+    dbLoc = os.path.join(os.path.normpath(os.path.dirname(__file__)), dbname)
+    try:
+        with dbm.dumb.open(dbLoc, flag = 'c') as db:
+            try:
+                j9Dict = _getCurrentj9Dict()
+            except urllib.error.URLError:
+                raise urllib.error.URLError("Unable to access server, check your connection")
+            for k, v in j9Dict.items():
+                if k in db:
+                    for jName in v:
+                        if jName not in j9Dict[k]:
+                            j9Dict[k] += '|' + jName
+                else:
+                    db[k] = '|'.join(v)
+    except dbm.dumb.error as e:
+        raise mkException("Something happened with the database of WOS journal names. To fix this you should delete the 1 to 3 files whose names start with {}. If this doesn't work (sorry), deleteing everything in '{}' and reinstalling metaknowledge should.\nThe error was '{}'".format(dbLoc, os.path.dirname(__file__), e))
 
 def getj9dict(dbname = abrevDBname, manualDB = manaulDBname, returnDict = 'both'):
     """Returns the dictionary of journal abbreviations mapping to a list of the associated journal names. By default the local database is used. The database is in the file _dbname_ in the same directory as this source file
@@ -142,21 +147,28 @@ def getj9dict(dbname = abrevDBname, manualDB = manaulDBname, returnDict = 'both'
     dbLoc = os.path.normpath(os.path.dirname(__file__))
 
     retDict = {}
-
-    if returnDict == 'both' or returnDict == 'WOS':
-        with dbm.dumb.open(dbLoc + '/{}'.format(dbname)) as db:
-            if len(db) == 0:
-                raise RuntimeError("J9 Database empty or missing, to regenerate it run metaknowledge.journalAbbreviations.updatej9DB().")
-            for k, v in db.items():
-                retDict[k.decode('utf-8')] = v.decode('utf-8').split('|')
-    if returnDict == 'both' or returnDict == 'manual':
-        if os.path.isfile(dbLoc + '/{}.dat'.format(manualDB)):
-            with dbm.dumb.open(dbLoc + '/{}'.format(manualDB)) as db:
+    try:
+        if returnDict == 'both' or returnDict == 'WOS':
+            with dbm.dumb.open(dbLoc + '/{}'.format(dbname)) as db:
+                if len(db) == 0:
+                    raise JournalDataBaseError("J9 Database empty or missing, to regenerate it import and run metaknowledge.WOS.journalAbbreviations.updatej9DB().")
                 for k, v in db.items():
                     retDict[k.decode('utf-8')] = v.decode('utf-8').split('|')
-        else:
-            if returnDict == 'manual':
-                raise RuntimeError("Manual J9 Database ({0}) missing, to create it run addToDB(dbname = {0})".format(manualDB))
+    except JournalDataBaseError:
+        updatej9DB()
+        return getj9dict(dbname = dbname, manualDB = manualDB, returnDict = returnDict)
+    try:
+        if returnDict == 'both' or returnDict == 'manual':
+            if os.path.isfile(dbLoc + '/{}.dat'.format(manualDB)):
+                with dbm.dumb.open(dbLoc + '/{}'.format(manualDB)) as db:
+                    for k, v in db.items():
+                        retDict[k.decode('utf-8')] = v.decode('utf-8').split('|')
+            else:
+                if returnDict == 'manual':
+                    raise JournalDataBaseError("Manual J9 Database ({0}) missing, to create it run addToDB(dbname = {0})".format(manualDB))
+    except JournalDataBaseError:
+        updatej9DB(dbname = manualDB)
+        return getj9dict(dbname = dbname, manualDB = manualDB, returnDict = returnDict)
     return retDict
 
 def addToDB(abbr = None, dbname = manaulDBname):
-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +"""
 +"""
 +from .recordProQuest import *
 +from .proQuestHandlers import *