143 changes: 143 additions & 0 deletions pyresparser/config.cfg
@@ -0,0 +1,143 @@
[paths]
train = "./data/train.spacy"
dev = "./data/dev.spacy"
vectors = null
init_tok2vec = null

[system]
gpu_allocator = null
seed = 0

[nlp]
lang = "en"
pipeline = ["tok2vec","ner"]
batch_size = 1000
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

[components]

[components.ner]
factory = "ner"
incorrect_spans_key = null
moves = null
scorer = {"@scorers":"spacy.ner_scorer.v1"}
update_with_oracle_cut_size = 100

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = 96
attrs = ["NORM","PREFIX","SUFFIX","SHAPE","SPACY"]
rows = [5000,1000,2500,2500,50]
include_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.2
accumulate_gradient = 1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
annotating_components = []
before_to_disk = null

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001

[training.score_weights]
ents_f = 1.0
ents_p = 0.0
ents_r = 0.0
ents_per_type = null

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]
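
This is the standard spaCy v3 config for a tok2vec + ner pipeline, trained from the corpus files referenced under [paths]. As a minimal sketch, training could be launched programmatically like this, assuming spaCy v3 is installed and the corpus has been built by create_spacy_corpus.py below (the output directory name is hypothetical):

from spacy.cli.train import train

# Train the pipeline defined in config.cfg; the [paths] values can be
# overridden here instead of editing the config file.
train(
    "pyresparser/config.cfg",
    output_path="pyresparser/model",
    overrides={
        "paths.train": "pyresparser/data/train.spacy",
        "paths.dev": "pyresparser/data/dev.spacy",
    },
)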
117 changes: 117 additions & 0 deletions pyresparser/create_spacy_corpus.py
@@ -0,0 +1,117 @@
import html
import json
import logging
import os
import os.path as pth
import re
import spacy

from sklearn.model_selection import train_test_split
from spacy.util import filter_spans
from spacy.tokens import DocBin
from tqdm import tqdm

logging.basicConfig(level=logging.INFO)

def trim_entity_spans(data: list) -> list:
"""Removes leading and trailing white spaces from entity spans.

Args:
data (list): The data to be cleaned in spaCy JSON format.

Returns:
list: The cleaned data.
"""
invalid_span_tokens = re.compile(r'\s')

cleaned_data = []
for text, annotations in data:
entities = annotations['entities']
valid_entities = []
for start, end, label in entities:
valid_start = start
valid_end = end
while valid_start < valid_end and invalid_span_tokens.match(
text[valid_start]):
valid_start += 1
while valid_end > valid_start and invalid_span_tokens.match(
text[valid_end - 1]):
valid_end -= 1
if valid_start == valid_end:
continue
valid_entities.append([valid_start, valid_end, label])
cleaned_data.append([text, {'entities': valid_entities}])

return cleaned_data
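
# Worked example (hypothetical data): the trailing space inside the labeled
# span [0, 12] over 'John  Smith ' is trimmed, so the span becomes [0, 11]:
#
#   trim_entity_spans([['John  Smith ', {'entities': [[0, 12, 'Name']]}]])
#   -> [['John  Smith ', {'entities': [[0, 11, 'Name']]}]]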


def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
try:
training_data = []
lines = []
with open(dataturks_JSON_FilePath, 'r', encoding="utf8") as f:
lines = f.readlines()

for line in lines:
data = json.loads(line)
text = html.unescape(data['content'])
entities = []
if data['annotation'] is not None:
for annotation in data['annotation']:
                    # each annotation carries a single span ("point") in the text.
point = annotation['points'][0]
labels = annotation['label']
                    # handle both a list of labels and a single label.
if not isinstance(labels, list):
labels = [labels]

for label in labels:
                        # Dataturks offsets are inclusive on both ends [start, end],
                        # while spaCy expects an exclusive end [start, end).
entities.append((
point['start'],
point['end'] + 1,
label
))

training_data.append((text, {"entities": entities}))
return training_data
except Exception:
        logging.exception("Unable to process %s", dataturks_JSON_FilePath)
return None


def get_train_data(path: str = "traindata.json") -> list:
    # fall back to an empty list if the source file could not be parsed
    return trim_entity_spans(convert_dataturks_to_spacy(path) or [])


def save_as_spacy_corpus(
        data: list, dest: str = '.', dev_size: float = 0.20) -> None:
os.makedirs(dest, exist_ok=True)
nlp = spacy.load('en_core_web_sm')
db_train = DocBin()
db_dev = DocBin()
docs = []
    for text, entities in tqdm(data, desc='Processing resumes'):
        spans = []
        doc = nlp(text)
        for start, end, label in entities['entities']:
            span = doc.char_span(start, end, label=label)
            if span is None:
                # skip annotations that do not align to token boundaries
                continue
            spans.append(span)
        doc.set_ents(filter_spans(spans))
        docs.append(doc)
    train, dev = train_test_split(docs, test_size=dev_size)
for doc in train:
db_train.add(doc)
for doc in dev:
db_dev.add(doc)
    db_train.to_disk(pth.join(dest, 'train.spacy'))
    db_dev.to_disk(pth.join(dest, 'dev.spacy'))


if __name__ == "__main__":
logging.info('Loading dataturks data...')
data = get_train_data(pth.join(pth.dirname(__file__), 'traindata.json'))
save_as_spacy_corpus(data, dest=pth.join(pth.dirname(__file__), 'data'))
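
As a quick sanity check of the inclusive-to-exclusive offset conversion in convert_dataturks_to_spacy, a single Dataturks record can be round-tripped through a temporary file (the record contents here are made up for illustration):

import json
import tempfile

record = {
    "content": "Python developer",
    "annotation": [{
        "points": [{"start": 0, "end": 5, "text": "Python"}],
        "label": ["Skills"],
    }],
}
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as tmp:
    tmp.write(json.dumps(record))
print(convert_dataturks_to_spacy(tmp.name))
# expected: [('Python developer', {'entities': [(0, 6, 'Skills')]})]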
Binary file added pyresparser/data/dev.spacy
Binary file not shown.
Binary file added pyresparser/data/train.spacy
Binary file not shown.
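
Once written, the corpus files can be loaded back for inspection with DocBin (a sketch, assuming the default output paths used above):

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
db = DocBin().from_disk("pyresparser/data/train.spacy")
# print the entities of the first few training docs
for doc in list(db.get_docs(nlp.vocab))[:3]:
    print([(ent.text, ent.label_) for ent in doc.ents])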