
Commit 2b2e7f3

chore: mistral-7b [tested on single GPU]
1 parent 6011421 commit 2b2e7f3

File tree

1 file changed: +223 −0 lines changed


Mistral-7B/classification.py

Lines changed: 223 additions & 0 deletions
@@ -0,0 +1,223 @@
import os
import random
import functools
import csv
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import f1_score, multilabel_confusion_matrix, roc_curve, auc
from skmultilearn.model_selection import iterative_train_test_split
from datasets import Dataset, DatasetDict
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model
)
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import cycle

# tokenize the text field and carry the multilabel targets through
def tokenize_examples(examples, tokenizer):
    tokenized_inputs = tokenizer(examples['text'])
    tokenized_inputs['labels'] = examples['labels']
    return tokenized_inputs

# define custom batch preprocessor
def collate_fn(batch, tokenizer):
    dict_keys = ['input_ids', 'attention_mask', 'labels']
    d = {k: [dic[k] for dic in batch] for k in dict_keys}
    d['input_ids'] = torch.nn.utils.rnn.pad_sequence(
        d['input_ids'], batch_first=True, padding_value=tokenizer.pad_token_id
    )
    d['attention_mask'] = torch.nn.utils.rnn.pad_sequence(
        d['attention_mask'], batch_first=True, padding_value=0
    )
    d['labels'] = torch.stack(d['labels'])
    return d

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.sigmoid(torch.tensor(logits)).numpy() > 0.5
    labels = np.asarray(labels)  # eval_pred already provides numpy arrays

    # Calculate F1 scores
    f1_micro = f1_score(labels, predictions, average='micro')
    f1_macro = f1_score(labels, predictions, average='macro')
    f1_weighted = f1_score(labels, predictions, average='weighted')

    # Plot Confusion Matrix for each label
    conf_matrices = multilabel_confusion_matrix(labels, predictions)
    fig, ax = plt.subplots(1, len(conf_matrices), figsize=(15, 5))
    if len(conf_matrices) > 1:
        for idx, cm in enumerate(conf_matrices):
            plot_confusion_matrix(cm, idx, ax[idx])
    else:
        plot_confusion_matrix(conf_matrices[0], 0, ax)
    plt.tight_layout()
    plt.show()

    # Plot ROC Curves
    plot_multilabel_roc(labels, torch.sigmoid(torch.tensor(logits)).numpy(), num_classes=labels.shape[1])

    return {'f1_micro': f1_micro, 'f1_macro': f1_macro, 'f1_weighted': f1_weighted}

# create custom trainer class to be able to pass label weights and calculate multilabel loss
class CustomTrainer(Trainer):

    def __init__(self, label_weights, **kwargs):
        super().__init__(**kwargs)
        self.label_weights = label_weights

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")

        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # compute custom loss
        loss = F.binary_cross_entropy_with_logits(logits, labels.to(torch.float32), pos_weight=self.label_weights)
        return (loss, outputs) if return_outputs else loss

# set random seed
random.seed(0)

# load data
with open('train.csv', newline='') as csvfile:
    data = list(csv.reader(csvfile, delimiter=','))
    header_row = data.pop(0)

# shuffle data
random.shuffle(data)

# reshape
idx, text, labels = list(zip(*[(int(row[0]), f'Title: {row[1].strip()}\n\nAbstract: {row[2].strip()}', row[3:]) for row in data]))
labels = np.array(labels, dtype=int)

# create label weights
label_weights = 1 - labels.sum(axis=0) / labels.sum()

# stratified train test split for multilabel ds
row_ids = np.arange(len(labels))
train_idx, y_train, val_idx, y_val = iterative_train_test_split(row_ids[:, np.newaxis], labels, test_size=0.1)
x_train = [text[i] for i in train_idx.flatten()]
x_val = [text[i] for i in val_idx.flatten()]

# create hf dataset
ds = DatasetDict({
    'train': Dataset.from_dict({'text': x_train, 'labels': y_train}),
    'val': Dataset.from_dict({'text': x_val, 'labels': y_val})
})

# model name
model_name = 'mistralai/Mistral-7B-v0.1'

# preprocess dataset with tokenizer (tokenize_examples is defined above)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenized_ds = ds.map(functools.partial(tokenize_examples, tokenizer=tokenizer), batched=True)
tokenized_ds = tokenized_ds.with_format('torch')

# quantization config
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, # enable 4-bit quantization
    bnb_4bit_quant_type='nf4', # information theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant=True, # quantize quantized weights //insert xzibit meme
    bnb_4bit_compute_dtype=torch.bfloat16 # optimized fp format for ML
)

# lora config
lora_config = LoraConfig(
    r=16, # the dimension of the low-rank matrices
    lora_alpha=8, # scaling factor for LoRA activations vs pre-trained weight activations
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout=0.05, # dropout probability of the LoRA layers
    bias='none', # whether to train bias weights; 'none' trains no bias parameters
    task_type='SEQ_CLS'
)

# load model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=labels.shape[1]
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.config.pad_token_id = tokenizer.pad_token_id

# define training args
training_args = TrainingArguments(
    output_dir = 'multilabel_classification',
    learning_rate = 1e-4,
    per_device_train_batch_size = 8, # tested with 16gb gpu ram
    per_device_eval_batch_size = 8,
    num_train_epochs = 10,
    weight_decay = 0.01,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True
)

# set up trainer
trainer = CustomTrainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_ds['train'],
    eval_dataset = tokenized_ds['val'],
    tokenizer = tokenizer,
    data_collator = functools.partial(collate_fn, tokenizer=tokenizer),
    compute_metrics = compute_metrics,
    label_weights = torch.tensor(label_weights, device=model.device)
)

# plotting helpers (called by compute_metrics at evaluation time)
def plot_confusion_matrix(conf_matrix, class_idx, ax, class_names=["Absent", "Present"]):
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(f'Class {class_idx}')
    ax.xaxis.set_ticklabels(class_names)
    ax.yaxis.set_ticklabels(class_names)


def plot_multilabel_roc(labels, predictions, num_classes):
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(num_classes):
        fpr[i], tpr[i], _ = roc_curve(labels[:, i], predictions[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    colors = cycle(['blue', 'red', 'green', 'yellow', 'cyan', 'magenta', 'black'])
    plt.figure(figsize=(10, 8))
    for i, color in zip(range(num_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=2,
                 label=f'ROC curve of class {i} (area = {roc_auc[i]:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic for Multi-label')
    plt.legend(loc="lower right")
    plt.show()

# train (compute_metrics needs the plotting helpers above to be defined before the first evaluation)
trainer.train()

# save model
peft_model_id = 'multilabel_mistral'
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)
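
For reference, a minimal inference sketch (not part of the commit), assuming the adapter and tokenizer were saved to multilabel_mistral as above: it rebuilds the 4-bit base model with the same BitsAndBytesConfig, attaches the trained LoRA weights with peft's PeftModel.from_pretrained, and scores one title/abstract string. The predict_labels helper, the num_labels value, and the threshold are hypothetical placeholders.

import torch
from peft import PeftModel
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig

peft_model_id = 'multilabel_mistral'           # directory written by save_pretrained above
base_model_name = 'mistralai/Mistral-7B-v0.1'

# same quantization settings as used for training
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
base_model = AutoModelForSequenceClassification.from_pretrained(
    base_model_name,
    quantization_config=quantization_config,
    num_labels=6  # hypothetical: must match labels.shape[1] from training
)
base_model.config.pad_token_id = tokenizer.pad_token_id

# attach the trained LoRA adapter on top of the quantized base model
model = PeftModel.from_pretrained(base_model, peft_model_id)
model.eval()

def predict_labels(text, threshold=0.5):
    # hypothetical helper: independent sigmoid per label, thresholded
    inputs = tokenizer(text, return_tensors='pt').to(base_model.device)
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.sigmoid(logits).squeeze(0)
    return (probs > threshold).int().tolist(), probs.tolist()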
