From 341dd97be28f78776c7ec18ccf8d8a59baf65791 Mon Sep 17 00:00:00 2001
From: RK
Date: Tue, 18 Apr 2023 18:38:49 +0800
Subject: [PATCH 1/2] fix bugs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

np.float raises an error on current NumPy versions
log files with a time-formatted name raise an error on Windows
updated the BertPreTrainedModel, BertModel imports
(illustrative sketches of all three fixes follow the patches below)
---
 pybert/callback/earlystopping.py     |  21 ++---
 pybert/callback/trainingmonitor.py   |   2 +-
 pybert/io/albert_processor.py        |   4 +-
 pybert/io/bert_processor.py          |  75 ++++++++---------
 pybert/io/xlnet_processor.py         |   4 +-
 pybert/model/bert_for_multi_label.py |   2 +-
 requirements.txt                     | 117 ++++++++++++++++++++-------
 run_xlnet.py                         |   2 +-
 8 files changed, 145 insertions(+), 82 deletions(-)

diff --git a/pybert/callback/earlystopping.py b/pybert/callback/earlystopping.py
index af0af54..c7023a2 100644
--- a/pybert/callback/earlystopping.py
+++ b/pybert/callback/earlystopping.py
@@ -1,5 +1,7 @@
 import numpy as np
 from ..common.tools import logger
+
+
 class EarlyStopping(object):
     '''
     """Stop training when a monitored quantity has stopped improving.
@@ -35,13 +37,14 @@ class EarlyStopping(object):
     monitor: the metric to monitor
     baseline: the baseline value
     '''
+
     def __init__(self,
-                 min_delta = 0,
-                 patience = 10,
-                 verbose = 1,
-                 mode = 'min',
-                 monitor = 'loss',
-                 baseline = None):
+                 min_delta=0,
+                 patience=10,
+                 verbose=1,
+                 mode='min',
+                 monitor='loss',
+                 baseline=None):
 
         self.baseline = baseline
         self.patience = patience
@@ -49,7 +52,7 @@ def __init__(self,
         self.min_delta = min_delta
         self.monitor = monitor
 
-        assert mode in ['min','max']
+        assert mode in ['min', 'max']
 
         if mode == 'min':
             self.monitor_op = np.less
@@ -70,13 +73,13 @@ def reset(self):
         else:
             self.best = np.Inf if self.monitor_op == np.less else -np.Inf
 
-    def epoch_step(self,current):
+    def epoch_step(self, current):
         if self.monitor_op(current - self.min_delta, self.best):
             self.best = current
             self.wait = 0
         else:
             self.wait += 1
             if self.wait >= self.patience:
-                if self.verbose >0:
+                if self.verbose > 0:
                     logger.info(f"{self.patience} epochs with no improvement after which training will be stopped")
                 self.stop_training = True
diff --git a/pybert/callback/trainingmonitor.py b/pybert/callback/trainingmonitor.py
index 8096ae7..4d958e7 100644
--- a/pybert/callback/trainingmonitor.py
+++ b/pybert/callback/trainingmonitor.py
@@ -36,7 +36,7 @@ def epoch_step(self, logs={}):
         for (k, v) in logs.items():
             l = self.H.get(k, [])
             # np.float32 would raise an error here
-            if not isinstance(v, np.float):
+            if not isinstance(v, float):
                 v = round(float(v), 4)
             l.append(v)
             self.H[k] = l
diff --git a/pybert/io/albert_processor.py b/pybert/io/albert_processor.py
index 77153e9..0d1f5bf 100644
--- a/pybert/io/albert_processor.py
+++ b/pybert/io/albert_processor.py
@@ -93,9 +93,9 @@ def create_examples(self,lines,example_type,cached_examples_file):
                 text_a = line[0]
                 label = line[1]
                 if isinstance(label,str):
-                    label = [np.float(x) for x in label.split(",")]
+                    label = [float(x) for x in label.split(",")]
                 else:
-                    label = [np.float(x) for x in list(label)]
+                    label = [float(x) for x in list(label)]
                 text_b = None
                 example = InputExample(guid = guid,text_a = text_a,text_b=text_b,label= label)
                 examples.append(example)
diff --git a/pybert/io/bert_processor.py b/pybert/io/bert_processor.py
index 8725534..c260641 100644
--- a/pybert/io/bert_processor.py
+++ b/pybert/io/bert_processor.py
@@ -7,6 +7,7 @@
 from torch.utils.data import TensorDataset
 from transformers import BertTokenizer
 
+
 class InputExample(object):
     def __init__(self, guid, text_a, text_b=None, label=None):
         """Constructs an InputExample.
@@ -19,27 +20,30 @@ def __init__(self, guid, text_a, text_b=None, label=None):
             label: (Optional) string. The label of the example. This should be
             specified for train and dev examples, but not for test examples.
         """
-        self.guid   = guid
+        self.guid = guid
         self.text_a = text_a
         self.text_b = text_b
-        self.label  = label
+        self.label = label
+
 
 class InputFeature(object):
     '''
     A single set of features of data.
     '''
-    def __init__(self,input_ids,input_mask,segment_ids,label_id,input_len):
-        self.input_ids   = input_ids
-        self.input_mask  = input_mask
+
+    def __init__(self, input_ids, input_mask, segment_ids, label_id, input_len):
+        self.input_ids = input_ids
+        self.input_mask = input_mask
         self.segment_ids = segment_ids
-        self.label_id    = label_id
+        self.label_id = label_id
         self.input_len = input_len
 
+
 class BertProcessor(object):
     """Base class for data converters for sequence classification data sets."""
 
-    def __init__(self,vocab_path,do_lower_case):
-        self.tokenizer = BertTokenizer(vocab_path,do_lower_case)
+    def __init__(self, vocab_path, do_lower_case):
+        self.tokenizer = BertTokenizer(vocab_path, do_lower_case)
 
     def get_train(self, data_file):
         """Gets a collection of `InputExample`s for the train set."""
@@ -49,15 +53,15 @@ def get_dev(self, data_file):
         """Gets a collection of `InputExample`s for the dev set."""
         return self.read_data(data_file)
 
-    def get_test(self,lines):
+    def get_test(self, lines):
         return lines
 
     def get_labels(self):
         """Gets the list of labels for this data set."""
-        return ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
+        return ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
 
     @classmethod
-    def read_data(cls, input_file,quotechar = None):
+    def read_data(cls, input_file, quotechar=None):
         """Reads a tab separated value file."""
         if 'pkl' in str(input_file):
             lines = load_pickle(input_file)
@@ -65,7 +69,7 @@ def read_data(cls, input_file,quotechar = None):
             lines = input_file
         return lines
 
-    def truncate_seq_pair(self,tokens_a,tokens_b,max_length):
+    def truncate_seq_pair(self, tokens_a, tokens_b, max_length):
         # This is a simple heuristic which will always truncate the longer sequence
         # one token at a time. This makes more sense than truncating an equal percent
         # of tokens from each, since if one sequence is very short then each token
@@ -79,33 +83,33 @@ def truncate_seq_pair(self,tokens_a,tokens_b,max_length):
         else:
             tokens_b.pop()
 
-    def create_examples(self,lines,example_type,cached_examples_file):
+    def create_examples(self, lines, example_type, cached_examples_file):
         '''
         Creates examples for data
         '''
-        pbar = ProgressBar(n_total = len(lines),desc='create examples')
+        pbar = ProgressBar(n_total=len(lines), desc='create examples')
         if cached_examples_file.exists():
             logger.info("Loading examples from cached file %s", cached_examples_file)
             examples = torch.load(cached_examples_file)
         else:
             examples = []
-            for i,line in enumerate(lines):
-                guid = '%s-%d'%(example_type,i)
+            for i, line in enumerate(lines):
+                guid = '%s-%d' % (example_type, i)
                 text_a = line[0]
                 label = line[1]
-                if isinstance(label,str):
-                    label = [np.float(x) for x in label.split(",")]
+                if isinstance(label, str):
+                    label = [float(x) for x in label.split(",")]
                 else:
-                    label = [np.float(x) for x in list(label)]
+                    label = [float(x) for x in list(label)]
                 text_b = None
-                example = InputExample(guid = guid,text_a = text_a,text_b=text_b,label= label)
+                example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
                 examples.append(example)
                 pbar(step=i)
             logger.info("Saving examples into cached file %s", cached_examples_file)
             torch.save(examples, cached_examples_file)
         return examples
 
-    def create_features(self,examples,max_seq_len,cached_features_file):
+    def create_features(self, examples, max_seq_len, cached_features_file):
         '''
         # The convention in BERT is:
         # (a) For sequence pairs:
@@ -115,13 +119,13 @@ def create_features(self,examples,max_seq_len,cached_features_file):
         #  tokens:   [CLS] the dog is hairy . [SEP]
         #  type_ids:   0   0   0   0  0     0   0
         '''
-        pbar = ProgressBar(n_total=len(examples),desc='create features')
+        pbar = ProgressBar(n_total=len(examples), desc='create features')
         if cached_features_file.exists():
             logger.info("Loading features from cached file %s", cached_features_file)
             features = torch.load(cached_features_file)
         else:
             features = []
-            for ex_id,example in enumerate(examples):
+            for ex_id, example in enumerate(examples):
                 tokens_a = self.tokenizer.tokenize(example.text_a)
                 tokens_b = None
                 label_id = example.label
                 if tokens_b:
                     # Modifies `tokens_a` and `tokens_b` in place so that the total
                     # length is less than the specified length.
                     # Account for [CLS], [SEP], [SEP] with "- 3"
-                    self.truncate_seq_pair(tokens_a,tokens_b,max_length = max_seq_len - 3)
+                    self.truncate_seq_pair(tokens_a, tokens_b, max_length=max_seq_len - 3)
                 else:
                     # Account for [CLS] and [SEP] with '-2'
                     if len(tokens_a) > max_seq_len - 2:
@@ -147,8 +151,8 @@
                 padding = [0] * (max_seq_len - len(input_ids))
                 input_len = len(input_ids)
 
-                input_ids   += padding
-                input_mask  += padding
+                input_ids += padding
+                input_mask += padding
                 segment_ids += padding
 
                 assert len(input_ids) == max_seq_len
@@ -163,27 +167,26 @@
                     logger.info(f"input_mask: {' '.join([str(x) for x in input_mask])}")
                     logger.info(f"segment_ids: {' '.join([str(x) for x in segment_ids])}")
 
-                feature = InputFeature(input_ids = input_ids,
-                                       input_mask = input_mask,
-                                       segment_ids = segment_ids,
-                                       label_id = label_id,
-                                       input_len = input_len)
+                feature = InputFeature(input_ids=input_ids,
+                                       input_mask=input_mask,
+                                       segment_ids=segment_ids,
+                                       label_id=label_id,
+                                       input_len=input_len)
                 features.append(feature)
                 pbar(step=ex_id)
             logger.info("Saving features into cached file %s", cached_features_file)
             torch.save(features, cached_features_file)
         return features
 
-    def create_dataset(self,features,is_sorted = False):
+    def create_dataset(self, features, is_sorted=False):
         # Convert to Tensors and build dataset
         if is_sorted:
             logger.info("sorted data by the length of input")
-            features = sorted(features,key=lambda x:x.input_len,reverse=True)
+            features = sorted(features, key=lambda x: x.input_len, reverse=True)
         all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
         all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
         all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
-        all_label_ids = torch.tensor([f.label_id for f in features],dtype=torch.long)
+        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
         all_input_lens = torch.tensor([f.input_len for f in features], dtype=torch.long)
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,all_input_lens)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_input_lens)
         return dataset
-
diff --git a/pybert/io/xlnet_processor.py b/pybert/io/xlnet_processor.py
index f2c237c..b85f333 100644
--- a/pybert/io/xlnet_processor.py
+++ b/pybert/io/xlnet_processor.py
@@ -94,9 +94,9 @@ def create_examples(self,lines,example_type,cached_examples_file):
                 text_a = line[0]
                 label = line[1]
                 if isinstance(label,str):
-                    label = [np.float(x) for x in label.split(",")]
+                    label = [float(x) for x in label.split(",")]
                 else:
-                    label = [np.float(x) for x in list(label)]
+                    label = [float(x) for x in list(label)]
                 text_b = None
                 example = InputExample(guid = guid,text_a = text_a,text_b=text_b,label= label)
                 examples.append(example)
diff --git a/pybert/model/bert_for_multi_label.py b/pybert/model/bert_for_multi_label.py
index ebb4336..03248a8 100644
--- a/pybert/model/bert_for_multi_label.py
+++ b/pybert/model/bert_for_multi_label.py
@@ -1,5 +1,5 @@
 import torch.nn as nn
-from transformers.modeling_bert import BertPreTrainedModel, BertModel
+from transformers import BertPreTrainedModel, BertModel
 
 class BertForMultiLable(BertPreTrainedModel):
     def __init__(self, config):
diff --git a/requirements.txt b/requirements.txt
index 8f8843a..daed6dc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,30 +1,87 @@
-boto3==1.9.227
-botocore==1.12.227
-certifi==2019.9.11
-chardet==3.0.4
-Click==7.0
-cycler==0.10.0
-docutils==0.15.2
-idna==2.8
-jmespath==0.9.4
-joblib==0.13.2
-kiwisolver==1.1.0
-matplotlib==3.1.1
-numpy==1.17.2
-pandas==0.25.1
-pillow>=6.2.0
-pyparsing==2.4.2
-python-dateutil==2.8.0
-transformers==2.5.1
-pytz==2019.2
-regex==2019.8.19
-requests==2.22.0
-s3transfer==0.2.1
-sacremoses==0.0.33
-scikit-learn==0.21.3
-scipy==1.3.1
-sentencepiece==0.1.83
-six==1.12.0
-torch==1.0.1
-tqdm==4.35.0
-urllib3==1.25.3
+asttokens==2.2.1
+backcall==0.2.0
+boto3==1.26.115
+botocore==1.29.115
+Bottleneck
+certifi==2022.12.7
+cffi
+chardet==5.1.0
+charset-normalizer==3.1.0
+click==8.1.3
+colorama
+comm==0.1.3
+contourpy==1.0.7
+cycler==0.11.0
+debugpy==1.6.7
+decorator==5.1.1
+executing==1.2.0
+filelock==3.11.0
+flit_core
+fonttools==4.39.3
+fsspec
+future
+huggingface-hub==0.13.4
+idna==3.4
+ipykernel==6.22.0
+ipython==8.12.0
+jedi==0.18.2
+Jinja2==3.1.2
+jmespath==1.0.1
+joblib==1.2.0
+jupyter_client==8.2.0
+jupyter_core==5.3.0
+kiwisolver==1.4.4
+lightning-utilities
+MarkupSafe==2.1.2
+matplotlib==3.7.1
+matplotlib-inline==0.1.6
+mkl-fft==1.3.1
+mkl-random
+mkl-service==2.4.0
+mpmath==1.3.0
+nest-asyncio==1.5.6
+networkx==3.1
+numexpr
+numpy==1.24.2
+packaging==23.1
+pandas
+parso==0.8.3
+pickleshare==0.7.5
+Pillow==9.5.0
+platformdirs==3.2.0
+prompt-toolkit==3.0.38
+psutil==5.9.5
+pure-eval==0.2.2
+pycparser
+Pygments==2.15.0
+pyparsing==3.0.9
+python-dateutil
+pytorch-lightning
+pytorch-transformers==1.2.0
+pytz
+pywin32==306
+PyYAML
+pyzmq==25.0.2
+regex==2023.3.23
+requests==2.28.2
+s3transfer==0.6.0
+sacremoses==0.0.53
+scikit-learn==1.2.2
+scipy==1.10.1
+sentencepiece==0.1.98
+six
+stack-data==0.6.2
+sympy==1.11.1
+threadpoolctl==3.1.0
+tokenizers==0.13.3
+torch==2.0.0+cu118
+torchaudio==2.0.1+cu118
+torchmetrics
+torchvision==0.15.1+cu118
+tornado==6.3
+tqdm
+traitlets==5.9.0
+transformers==4.28.1
+typing_extensions==4.5.0
+urllib3==1.26.15
+wcwidth==0.2.6
\ No newline at end of file
diff --git a/run_xlnet.py b/run_xlnet.py
index cad633e..51aad2b 100644
--- a/run_xlnet.py
+++ b/run_xlnet.py
@@ -188,7 +188,7 @@ def main():
     parser.add_argument('--fp16', action='store_true')
     parser.add_argument('--fp16_opt_level', type=str, default='O1')
     args = parser.parse_args()
-    init_logger(log_file=config['log_dir'] / f'{args.arch}-{time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())}.log')
+    init_logger(log_file=config['log_dir'] / f'{args.arch}-{time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())}.log')
     config['checkpoint_dir'] = config['checkpoint_dir'] / args.arch
     config['checkpoint_dir'].mkdir(exist_ok=True)
     # Good practice: save your training arguments together with the trained model

From 030eed3c0469215b1a3d89e541728710d992706d Mon Sep 17 00:00:00 2001
From: RK
Date: Tue, 18 Apr 2023 18:46:25 +0800
Subject: [PATCH 2/2] update readme.md
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

add the command for predicting new data
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4ebb485..92089e1 100644
--- a/README.md
+++ b/README.md
@@ -65,7 +65,7 @@ you need to download pretrained bert model and xlnet model.
 8. Modify configuration information in `pybert/configs/basic_config.py`(the path of data,...).
 9. Run `python run_bert.py --do_data` to preprocess data.
 10. Run `python run_bert.py --do_train --save_best --do_lower_case` to fine-tune the BERT model.
-11. Run `run_bert.py --do_test --do_lower_case` to predict new data.
+11. Run `python run_bert.py --do_test --do_lower_case` to predict new data.
 
 ### training
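
Notes (illustrative sketches, not part of the diffs above):

The np.float fix: np.float was deprecated in NumPy 1.20 and removed in NumPy 1.24, so with the pinned numpy==1.24.2 the old label parsing raises AttributeError. A minimal sketch of the replacement, using a hypothetical label string:

    # np.float was simply an alias for the builtin float, so the builtin is a
    # drop-in replacement; np.float(x) raises AttributeError on NumPy >= 1.24.
    label = "0,1,0,0,1,0"  # hypothetical multi-label annotation from the CSV
    parsed = [float(x) for x in label.split(",")]
    print(parsed)  # -> [0.0, 1.0, 0.0, 0.0, 1.0, 0.0]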
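
The Windows log-file fix in run_xlnet.py: a colon is a reserved character in Windows file names, so a "%H:%M:%S" timestamp in the log name makes open() fail with OSError on Windows. A sketch of the two formats:

    import time

    # Old format, e.g. '2023-04-18-18:38:49' -- invalid as a Windows file name.
    stamp_old = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())
    # New format, e.g. '2023-04-18-18-38-49' -- valid on every platform.
    stamp_new = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())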
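
The import fix in bert_for_multi_label.py: transformers 4.x reorganized its internals, moving BERT's code to transformers.models.bert.modeling_bert and removing the old transformers.modeling_bert module, so with the pinned transformers==4.28.1 the old import fails. Both classes remain exported from the package root, which is the stable import path:

    # Fails with ModuleNotFoundError on transformers >= 4.x:
    #   from transformers.modeling_bert import BertPreTrainedModel, BertModel
    from transformers import BertPreTrainedModel, BertModel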