From 341dd97be28f78776c7ec18ccf8d8a59baf65791 Mon Sep 17 00:00:00 2001
From: RK
Date: Tue, 18 Apr 2023 18:38:49 +0800
Subject: [PATCH 1/2] fix bugs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

np.float raises an error on current NumPy versions
log files with a time-formatted name raise an error on Windows
updated the BertPreTrainedModel, BertModel imports
(illustrative sketches of all three fixes follow the patches below)
---
 pybert/callback/earlystopping.py     |  21 ++---
 pybert/callback/trainingmonitor.py   |   2 +-
 pybert/io/albert_processor.py        |   4 +-
 pybert/io/bert_processor.py          |  75 ++++++++---------
 pybert/io/xlnet_processor.py         |   4 +-
 pybert/model/bert_for_multi_label.py |   2 +-
 requirements.txt                     | 117 ++++++++++++++++++++-------
 run_xlnet.py                         |   2 +-
 8 files changed, 145 insertions(+), 82 deletions(-)

diff --git a/pybert/callback/earlystopping.py b/pybert/callback/earlystopping.py
index af0af54..c7023a2 100644
--- a/pybert/callback/earlystopping.py
+++ b/pybert/callback/earlystopping.py
@@ -1,5 +1,7 @@
 import numpy as np
 from ..common.tools import logger
+
+
 class EarlyStopping(object):
     '''
     """Stop training when a monitored quantity has stopped improving.
@@ -35,13 +37,14 @@ class EarlyStopping(object):
     monitor: the metric to monitor
     baseline: the baseline value
     '''
+
     def __init__(self,
-                 min_delta = 0,
-                 patience = 10,
-                 verbose = 1,
-                 mode = 'min',
-                 monitor = 'loss',
-                 baseline = None):
+                 min_delta=0,
+                 patience=10,
+                 verbose=1,
+                 mode='min',
+                 monitor='loss',
+                 baseline=None):
 
         self.baseline = baseline
         self.patience = patience
@@ -49,7 +52,7 @@ def __init__(self,
         self.min_delta = min_delta
         self.monitor = monitor
 
-        assert mode in ['min','max']
+        assert mode in ['min', 'max']
 
         if mode == 'min':
             self.monitor_op = np.less
@@ -70,13 +73,13 @@ def reset(self):
         else:
             self.best = np.Inf if self.monitor_op == np.less else -np.Inf
 
-    def epoch_step(self,current):
+    def epoch_step(self, current):
         if self.monitor_op(current - self.min_delta, self.best):
             self.best = current
             self.wait = 0
         else:
             self.wait += 1
             if self.wait >= self.patience:
-                if self.verbose >0:
+                if self.verbose > 0:
                     logger.info(f"{self.patience} epochs with no improvement after which training will be stopped")
                 self.stop_training = True
diff --git a/pybert/callback/trainingmonitor.py b/pybert/callback/trainingmonitor.py
index 8096ae7..4d958e7 100644
--- a/pybert/callback/trainingmonitor.py
+++ b/pybert/callback/trainingmonitor.py
@@ -36,7 +36,7 @@ def epoch_step(self, logs={}):
         for (k, v) in logs.items():
             l = self.H.get(k, [])
             # np.float32 would raise an error here
-            if not isinstance(v, np.float):
+            if not isinstance(v, float):
                 v = round(float(v), 4)
             l.append(v)
             self.H[k] = l
diff --git a/pybert/io/albert_processor.py b/pybert/io/albert_processor.py
index 77153e9..0d1f5bf 100644
--- a/pybert/io/albert_processor.py
+++ b/pybert/io/albert_processor.py
@@ -93,9 +93,9 @@ def create_examples(self,lines,example_type,cached_examples_file):
                 text_a = line[0]
                 label = line[1]
                 if isinstance(label,str):
-                    label = [np.float(x) for x in label.split(",")]
+                    label = [float(x) for x in label.split(",")]
                 else:
-                    label = [np.float(x) for x in list(label)]
+                    label = [float(x) for x in list(label)]
                 text_b = None
                 example = InputExample(guid = guid,text_a = text_a,text_b=text_b,label= label)
                 examples.append(example)
diff --git a/pybert/io/bert_processor.py b/pybert/io/bert_processor.py
index 8725534..c260641 100644
--- a/pybert/io/bert_processor.py
+++ b/pybert/io/bert_processor.py
@@ -7,6 +7,7 @@
 from torch.utils.data import TensorDataset
 from transformers import BertTokenizer
 
+
 class InputExample(object):
     def __init__(self, guid, text_a, text_b=None, label=None):
         """Constructs an InputExample.
@@ -19,27 +20,30 @@ def __init__(self, guid, text_a, text_b=None, label=None):
             label: (Optional) string. The label of the example. This should be
             specified for train and dev examples, but not for test examples.
         """
-        self.guid   = guid
+        self.guid = guid
         self.text_a = text_a
         self.text_b = text_b
-        self.label  = label
+        self.label = label
+
 
 class InputFeature(object):
     '''
     A single set of features of data.
     '''
-    def __init__(self,input_ids,input_mask,segment_ids,label_id,input_len):
-        self.input_ids   = input_ids
-        self.input_mask  = input_mask
+
+    def __init__(self, input_ids, input_mask, segment_ids, label_id, input_len):
+        self.input_ids = input_ids
+        self.input_mask = input_mask
         self.segment_ids = segment_ids
-        self.label_id    = label_id
+        self.label_id = label_id
         self.input_len = input_len
 
+
 class BertProcessor(object):
     """Base class for data converters for sequence classification data sets."""
 
-    def __init__(self,vocab_path,do_lower_case):
-        self.tokenizer = BertTokenizer(vocab_path,do_lower_case)
+    def __init__(self, vocab_path, do_lower_case):
+        self.tokenizer = BertTokenizer(vocab_path, do_lower_case)
 
     def get_train(self, data_file):
         """Gets a collection of `InputExample`s for the train set."""
@@ -49,15 +53,15 @@ def get_dev(self, data_file):
         """Gets a collection of `InputExample`s for the dev set."""
         return self.read_data(data_file)
 
-    def get_test(self,lines):
+    def get_test(self, lines):
         return lines
 
     def get_labels(self):
         """Gets the list of labels for this data set."""
-        return ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
+        return ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
 
     @classmethod
-    def read_data(cls, input_file,quotechar = None):
+    def read_data(cls, input_file, quotechar=None):
         """Reads a tab separated value file."""
         if 'pkl' in str(input_file):
             lines = load_pickle(input_file)
@@ -65,7 +69,7 @@ def read_data(cls, input_file,quotechar = None):
             lines = input_file
         return lines
 
-    def truncate_seq_pair(self,tokens_a,tokens_b,max_length):
+    def truncate_seq_pair(self, tokens_a, tokens_b, max_length):
         # This is a simple heuristic which will always truncate the longer sequence
         # one token at a time. This makes more sense than truncating an equal percent
         # of tokens from each, since if one sequence is very short then each token
@@ -79,33 +83,33 @@ def truncate_seq_pair(self,tokens_a,tokens_b,max_length):
         else:
             tokens_b.pop()
 
-    def create_examples(self,lines,example_type,cached_examples_file):
+    def create_examples(self, lines, example_type, cached_examples_file):
         '''
         Creates examples for data
         '''
-        pbar = ProgressBar(n_total = len(lines),desc='create examples')
+        pbar = ProgressBar(n_total=len(lines), desc='create examples')
         if cached_examples_file.exists():
             logger.info("Loading examples from cached file %s", cached_examples_file)
             examples = torch.load(cached_examples_file)
         else:
             examples = []
-            for i,line in enumerate(lines):
-                guid = '%s-%d'%(example_type,i)
+            for i, line in enumerate(lines):
+                guid = '%s-%d' % (example_type, i)
                 text_a = line[0]
                 label = line[1]
-                if isinstance(label,str):
-                    label = [np.float(x) for x in label.split(",")]
+                if isinstance(label, str):
+                    label = [float(x) for x in label.split(",")]
                 else:
-                    label = [np.float(x) for x in list(label)]
+                    label = [float(x) for x in list(label)]
                 text_b = None
-                example = InputExample(guid = guid,text_a = text_a,text_b=text_b,label= label)
+                example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
                 examples.append(example)
                 pbar(step=i)
             logger.info("Saving examples into cached file %s", cached_examples_file)
             torch.save(examples, cached_examples_file)
         return examples
 
-    def create_features(self,examples,max_seq_len,cached_features_file):
+    def create_features(self, examples, max_seq_len, cached_features_file):
         '''
         # The convention in BERT is:
         # (a) For sequence pairs:
@@ -115,13 +119,13 @@ def create_features(self,examples,max_seq_len,cached_features_file):
         #  tokens:   [CLS] the dog is hairy . [SEP]
         #  type_ids:   0   0   0   0  0     0   0
         '''
-        pbar = ProgressBar(n_total=len(examples),desc='create features')
+        pbar = ProgressBar(n_total=len(examples), desc='create features')
         if cached_features_file.exists():
             logger.info("Loading features from cached file %s", cached_features_file)
             features = torch.load(cached_features_file)
         else:
             features = []
-            for ex_id,example in enumerate(examples):
+            for ex_id, example in enumerate(examples):
                 tokens_a = self.tokenizer.tokenize(example.text_a)
                 tokens_b = None
                 label_id = example.label
                 if tokens_b:
                     # Modifies `tokens_a` and `tokens_b` in place so that the total
                     # length is less than the specified length.
                     # Account for [CLS], [SEP], [SEP] with "- 3"
-                    self.truncate_seq_pair(tokens_a,tokens_b,max_length = max_seq_len - 3)
+                    self.truncate_seq_pair(tokens_a, tokens_b, max_length=max_seq_len - 3)
                 else:
                     # Account for [CLS] and [SEP] with '-2'
                     if len(tokens_a) > max_seq_len - 2:
@@ -147,8 +151,8 @@
                 padding = [0] * (max_seq_len - len(input_ids))
                 input_len = len(input_ids)
 
-                input_ids   += padding
-                input_mask  += padding
+                input_ids += padding
+                input_mask += padding
                 segment_ids += padding
 
                 assert len(input_ids) == max_seq_len
@@ -163,27 +167,26 @@
                     logger.info(f"input_mask: {' '.join([str(x) for x in input_mask])}")
                     logger.info(f"segment_ids: {' '.join([str(x) for x in segment_ids])}")
 
-                feature = InputFeature(input_ids = input_ids,
-                                       input_mask = input_mask,
-                                       segment_ids = segment_ids,
-                                       label_id = label_id,
-                                       input_len = input_len)
+                feature = InputFeature(input_ids=input_ids,
+                                       input_mask=input_mask,
+                                       segment_ids=segment_ids,
+                                       label_id=label_id,
+                                       input_len=input_len)
                 features.append(feature)
                 pbar(step=ex_id)
             logger.info("Saving features into cached file %s", cached_features_file)
             torch.save(features, cached_features_file)
         return features
 
-    def create_dataset(self,features,is_sorted = False):
+    def create_dataset(self, features, is_sorted=False):
         # Convert to Tensors and build dataset
         if is_sorted:
             logger.info("sorted data by the length of input")
-            features = sorted(features,key=lambda x:x.input_len,reverse=True)
+            features = sorted(features, key=lambda x: x.input_len, reverse=True)
         all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
         all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
         all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
-        all_label_ids = torch.tensor([f.label_id for f in features],dtype=torch.long)
+        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
         all_input_lens = torch.tensor([f.input_len for f in features], dtype=torch.long)
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,all_input_lens)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_input_lens)
         return dataset
-
diff --git a/pybert/io/xlnet_processor.py b/pybert/io/xlnet_processor.py
index f2c237c..b85f333 100644
--- a/pybert/io/xlnet_processor.py
+++ b/pybert/io/xlnet_processor.py
@@ -94,9 +94,9 @@ def create_examples(self,lines,example_type,cached_examples_file):
                 text_a = line[0]
                 label = line[1]
                 if isinstance(label,str):
-                    label = [np.float(x) for x in label.split(",")]
+                    label = [float(x) for x in label.split(",")]
                 else:
-                    label = [np.float(x) for x in list(label)]
+                    label = [float(x) for x in list(label)]
                 text_b = None
                 example = InputExample(guid = guid,text_a = text_a,text_b=text_b,label= label)
                 examples.append(example)
diff --git a/pybert/model/bert_for_multi_label.py b/pybert/model/bert_for_multi_label.py
index ebb4336..03248a8 100644
--- a/pybert/model/bert_for_multi_label.py
+++ b/pybert/model/bert_for_multi_label.py
@@ -1,5 +1,5 @@
 import torch.nn as nn
-from transformers.modeling_bert import BertPreTrainedModel, BertModel
+from transformers import BertPreTrainedModel, BertModel
 
 class BertForMultiLable(BertPreTrainedModel):
     def __init__(self, config):
diff --git a/requirements.txt b/requirements.txt
index 8f8843a..daed6dc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,30 +1,87 @@
-boto3==1.9.227
-botocore==1.12.227
-certifi==2019.9.11
-chardet==3.0.4
-Click==7.0
-cycler==0.10.0
-docutils==0.15.2
-idna==2.8
-jmespath==0.9.4
-joblib==0.13.2
-kiwisolver==1.1.0
-matplotlib==3.1.1
-numpy==1.17.2
-pandas==0.25.1
-pillow>=6.2.0
-pyparsing==2.4.2
-python-dateutil==2.8.0
-transformers==2.5.1
-pytz==2019.2
-regex==2019.8.19
-requests==2.22.0
-s3transfer==0.2.1
-sacremoses==0.0.33
-scikit-learn==0.21.3
-scipy==1.3.1
-sentencepiece==0.1.83
-six==1.12.0
-torch==1.0.1
-tqdm==4.35.0
-urllib3==1.25.3
+asttokens==2.2.1
+backcall==0.2.0
+boto3==1.26.115
+botocore==1.29.115
+Bottleneck
+certifi==2022.12.7
+cffi
+chardet==5.1.0
+charset-normalizer==3.1.0
+click==8.1.3
+colorama
+comm==0.1.3
+contourpy==1.0.7
+cycler==0.11.0
+debugpy==1.6.7
+decorator==5.1.1
+executing==1.2.0
+filelock==3.11.0
+flit_core
+fonttools==4.39.3
+fsspec
+future
+huggingface-hub==0.13.4
+idna==3.4
+ipykernel==6.22.0
+ipython==8.12.0
+jedi==0.18.2
+Jinja2==3.1.2
+jmespath==1.0.1
+joblib==1.2.0
+jupyter_client==8.2.0
+jupyter_core==5.3.0
+kiwisolver==1.4.4
+lightning-utilities
+MarkupSafe==2.1.2
+matplotlib==3.7.1
+matplotlib-inline==0.1.6
+mkl-fft==1.3.1
+mkl-random
+mkl-service==2.4.0
+mpmath==1.3.0
+nest-asyncio==1.5.6
+networkx==3.1
+numexpr
+numpy==1.24.2
+packaging==23.1
+pandas
+parso==0.8.3
+pickleshare==0.7.5
+Pillow==9.5.0
+platformdirs==3.2.0
+prompt-toolkit==3.0.38
+psutil==5.9.5
+pure-eval==0.2.2
+pycparser
+Pygments==2.15.0
+pyparsing==3.0.9
+python-dateutil
+pytorch-lightning
+pytorch-transformers==1.2.0
+pytz
+pywin32==306
+PyYAML
+pyzmq==25.0.2
+regex==2023.3.23
+requests==2.28.2
+s3transfer==0.6.0
+sacremoses==0.0.53
+scikit-learn==1.2.2
+scipy==1.10.1
+sentencepiece==0.1.98
+six
+stack-data==0.6.2
+sympy==1.11.1
+threadpoolctl==3.1.0
+tokenizers==0.13.3
+torch==2.0.0+cu118
+torchaudio==2.0.1+cu118
+torchmetrics
+torchvision==0.15.1+cu118
+tornado==6.3
+tqdm
+traitlets==5.9.0
+transformers==4.28.1
+typing_extensions==4.5.0
+urllib3==1.26.15
+wcwidth==0.2.6
\ No newline at end of file
diff --git a/run_xlnet.py b/run_xlnet.py
index cad633e..51aad2b 100644
--- a/run_xlnet.py
+++ b/run_xlnet.py
@@ -188,7 +188,7 @@ def main():
     parser.add_argument('--fp16', action='store_true')
     parser.add_argument('--fp16_opt_level', type=str, default='O1')
     args = parser.parse_args()
-    init_logger(log_file=config['log_dir'] / f'{args.arch}-{time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())}.log')
+    init_logger(log_file=config['log_dir'] / f'{args.arch}-{time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())}.log')
     config['checkpoint_dir'] = config['checkpoint_dir'] / args.arch
     config['checkpoint_dir'].mkdir(exist_ok=True)
     # Good practice: save your training arguments together with the trained model

From 030eed3c0469215b1a3d89e541728710d992706d Mon Sep 17 00:00:00 2001
From: RK
Date: Tue, 18 Apr 2023 18:46:25 +0800
Subject: [PATCH 2/2] update readme.md
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

add the command for predicting new data
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4ebb485..92089e1 100644
--- a/README.md
+++ b/README.md
@@ -65,7 +65,7 @@ you need to download pretrained bert model and xlnet model.
 8. Modify configuration information in `pybert/configs/basic_config.py`(the path of data,...).
 9. Run `python run_bert.py --do_data` to preprocess data.
 10. Run `python run_bert.py --do_train --save_best --do_lower_case` to fine-tune the BERT model.
-11. Run `run_bert.py --do_test --do_lower_case` to predict new data.
+11. Run `python run_bert.py --do_test --do_lower_case` to predict new data.
 
 ### training
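
Notes (illustrative sketches, not part of the diffs above):

The np.float fix: np.float was deprecated in NumPy 1.20 and removed in NumPy 1.24, so with the pinned numpy==1.24.2 the old label parsing raises AttributeError. A minimal sketch of the replacement, using a hypothetical label string:

    # np.float was simply an alias for the builtin float, so the builtin is a
    # drop-in replacement; np.float(x) raises AttributeError on NumPy >= 1.24.
    label = "0,1,0,0,1,0"  # hypothetical multi-label annotation from the CSV
    parsed = [float(x) for x in label.split(",")]
    print(parsed)  # -> [0.0, 1.0, 0.0, 0.0, 1.0, 0.0]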
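
The Windows log-file fix in run_xlnet.py: a colon is a reserved character in Windows file names, so a "%H:%M:%S" timestamp in the log name makes open() fail with OSError on Windows. A sketch of the two formats:

    import time

    # Old format, e.g. '2023-04-18-18:38:49' -- invalid as a Windows file name.
    stamp_old = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())
    # New format, e.g. '2023-04-18-18-38-49' -- valid on every platform.
    stamp_new = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())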
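
The import fix in bert_for_multi_label.py: transformers 4.x reorganized its internals, moving BERT's code to transformers.models.bert.modeling_bert and removing the old transformers.modeling_bert module, so with the pinned transformers==4.28.1 the old import fails. Both classes remain exported from the package root, which is the stable import path:

    # Fails with ModuleNotFoundError on transformers >= 4.x:
    #   from transformers.modeling_bert import BertPreTrainedModel, BertModel
    from transformers import BertPreTrainedModel, BertModel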