diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-peft-quantization.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-peft-quantization.ipynb new file mode 100644 index 0000000000..a8c3dba3ca --- /dev/null +++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-peft-quantization.ipynb @@ -0,0 +1,12060 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a59f475d-d843-46bc-b75e-10984b687ed3", + "metadata": {}, + "source": [ + "# LLM Federated Finetuning with PEFT and Quantization" + ] + }, + { + "cell_type": "markdown", + "id": "20c74cb9-51a2-42e2-893f-d280e227e8bf", + "metadata": {}, + "source": [ + "## Overview\n", + "\n", + "This tutorial demonstrates how to efficiently fine-tune Microsoft's Phi-4 model (14B parameters) in a federated learning workflow using the OpenFL framework with advanced techniques for memory optimization and performance enhancement. The approach combines:\n", + "\n", + "### Memory Optimization Techniques\n", + "- **Parameter-Efficient Fine-Tuning (PEFT)**: Using Low-Rank Adaptation (LoRA) to fine-tune only a small subset of model parameters\n", + "- **Quantization**: Comparing 4-bit (NF4) and 8-bit quantization approaches with QLoRA to reduce memory footprint\n", + "- **Gradient Checkpointing**: Trading computation for memory by recomputing activations during backpropagation\n", + "\n", + "### Training Enhancements\n", + "- **Partial Round Updates**: Breaking each global round into partial updates for more frequent knowledge sharing\n", + "- **Fixed Training Steps**: Using a fixed number of training steps per round (`max_steps`, 50 in the configuration below; halved for 8-bit runs) rather than full epochs\n", + "- **Optimizer State Preservation**: Maintaining optimizer momentum across federation rounds\n", + "- **Memory Usage Tracking**: Detailed monitoring of GPU/CPU memory consumption across training phases\n", + "\n", + "### Federated Learning Architecture\n", + "- **Server-Client Model**: Central aggregator and multiple collaborators (simulated locally)\n", + "- **Federated
Averaging**: Weighted parameter averaging between collaborator models\n", + "- **Metrics Visualization**: Tracking and comparing training loss, validation loss, and memory usage\n", + "\n", + "The tutorial implements a complete workflow that addresses common challenges in federated fine-tuning of large language models, including memory constraints, training efficiency, and performance metrics tracking across heterogeneous clients." + ] + }, + { + "cell_type": "markdown", + "id": "7241cc9a", + "metadata": {}, + "source": [ + "Before running the notebook, make sure the NVIDIA drivers are installed, for example with the commands below:\n", + "```\n", + "sudo apt update \n", + "sudo apt install -y nvidia-driver-550 \n", + "sudo reboot\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "d07c32d3-1a8d-4162-af45-bc3a10e0ae3f", + "metadata": {}, + "source": [ + "## Installation" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "05b2ad75-8c7b-499c-902e-dbd5b24361bc", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install torch transformers peft datasets trl==0.12.2 bitsandbytes accelerate -q" + ] + }, + { + "cell_type": "markdown", + "id": "440a9c39-ec42-45a5-80f6-9a9e0bc90d2f", + "metadata": {}, + "source": [ + "## Import Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a3e6c3f4-dec3-4d3a-97cb-5b35bec06046", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install seaborn matplotlib pandas -q" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "be4690ae-0671-4d3a-8f21-620ab865a03e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/azureuser/env_name/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets.
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "2025-05-21 08:21:48,953\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n" + ] + } + ], + "source": [ + "import os\n", + "import torch\n", + "from transformers import (\n", + " AutoModelForCausalLM,\n", + " AutoTokenizer,\n", + " BitsAndBytesConfig,\n", + " TrainingArguments\n", + ")\n", + "from peft import (\n", + " LoraConfig,\n", + " get_peft_model,\n", + " prepare_model_for_kbit_training,\n", + " PeftModel\n", + ")\n", + "from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict # Added this import\n", + "from datasets import load_dataset\n", + "from trl import SFTTrainer\n", + "from openfl.experimental.workflow.interface import Aggregator, Collaborator, FLSpec\n", + "from openfl.experimental.workflow.placement import aggregator, collaborator\n", + "from openfl.experimental.workflow.runtime import LocalRuntime\n", + "import numpy as np\n", + "from transformers.trainer_callback import PrinterCallback\n", + "import transformers\n", + "import gc\n", + "import psutil\n", + "\n", + "# Import our utility functions\n", + "from openfl.utilities.phi_utils import (\n", + " get_gpu_memory_info,\n", + " MemoryTracker,\n", + " plot_memory_metrics,\n", + " plot_loss_metrics,\n", + " plot_aggregated_metrics\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "74fed8f2", + "metadata": {}, + "outputs": [], + "source": [ + "# Memory optimization setup\n", + "os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\"\n", + "os.environ[\"TRANSFORMERS_ATTN_IMPLEMENTATION\"] = \"flash_attention_2\"\n", + "\n", + "def clear_gpu():\n", + " torch.cuda.empty_cache()\n", + " gc.collect()\n", + "\n", + "clear_gpu()" + ] + }, + { + "cell_type": "markdown", + "id": "8fa0941e-5fd7-401b-9cc7-0beb5a2a3621", + 
"metadata": {}, + "source": [ + "## Acquiring and preprocessing dataset\n", + "We can download the dataset directly from the [LLM-Adapters repository](https://github.com/AGI-Edgerunners/LLM-Adapters)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a50ae4a4-628d-4f45-a9fc-c5c437df229e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset already exists locally.\n" + ] + } + ], + "source": [ + "# Import libraries needed for downloading and verifying the dataset\n", + "import hashlib\n", + "import requests\n", + "\n", + "def file_checksum(file_path, algorithm=\"sha256\"):\n", + " \"\"\"\n", + " Calculate the checksum of a file using the specified hashing algorithm.\n", + " \n", + " Args:\n", + " file_path (str): The path to the file for which the checksum is to be calculated.\n", + " algorithm (str): The hashing algorithm to use (default is 'sha256').\n", + " \n", + " Returns:\n", + " str: The calculated checksum of the file.\n", + " \"\"\"\n", + " hash_func = hashlib.new(algorithm)\n", + " with open(file_path, \"rb\") as f:\n", + " for chunk in iter(lambda: f.read(4096), b\"\"):\n", + " hash_func.update(chunk)\n", + " return hash_func.hexdigest()\n", + "\n", + "\n", + "# Download the dataset if it doesn't exist locally\n", + "if not os.path.exists(\"math_10k.json\"):\n", + " print(\"Downloading math_10k.json dataset...\")\n", + " r = requests.get(\n", + " \"https://raw.githubusercontent.com/AGI-Edgerunners/LLM-Adapters/main/ft-training_set/math_10k.json\",\n", + " )\n", + " with open(\n", + " \"math_10k.json\",\n", + " \"wb\",\n", + " ) as f:\n", + " f.write(r.content)\n", + " print(\"Download complete.\")\n", + "\n", + " # Verify the integrity of the downloaded file\n", + " actual_checksum = file_checksum(\"math_10k.json\")\n", + " expected_checksum = \"0342d0d860ad8592b579329337c90e42eefd3d9f2898043140cbd120630418b8\"\n", + " if actual_checksum != expected_checksum:\n", + " raise 
ValueError(\n", + " \"Checksum verification failed. The file may have been altered.\"\n", + " )\n", + " print(\"Checksum verification successful.\")\n", + "else:\n", + " print(\"Dataset already exists locally.\")\n", + "\n", + "# Set the dataset path to be used later\n", + "dataset_name = \"math_10k.json\"" + ] + }, + { + "cell_type": "markdown", + "id": "08576aa0-f628-4ae6-8fc3-dd167d164784", + "metadata": {}, + "source": [ + "## Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "eada9809-468a-47c6-9b03-55aa887c9487", + "metadata": {}, + "outputs": [], + "source": [ + "# Model and dataset\n", + "model_name = \"microsoft/phi-4\"\n", + "dataset_name = \"math_10k.json\"\n", + "\n", + "# 4-bit QLoRA configuration\n", + "bnb_config_4bit = BitsAndBytesConfig(\n", + " load_in_4bit=True,\n", + " bnb_4bit_quant_type=\"nf4\",\n", + " bnb_4bit_compute_dtype=torch.bfloat16,\n", + " bnb_4bit_use_double_quant=True, # Enable double quantization for further memory saving\n", + ")\n", + "\n", + "# 8-bit QLoRA configuration with more aggressive memory savings\n", + "bnb_config_8bit = BitsAndBytesConfig(\n", + " load_in_8bit=True,\n", + " llm_int8_enable_fp32_cpu_offload=True,\n", + " llm_int8_skip_modules=['lm_head'],\n", + " llm_int8_threshold=6.0,\n", + " llm_int8_has_fp16_weight=False,\n", + ")\n", + "\n", + "# Active quantization config (will be set to either 4-bit or 8-bit)\n", + "bnb_config = bnb_config_4bit # Default to 4-bit\n", + "\n", + "# LoRA configuration - reduce parameters to save memory\n", + "peft_config = LoraConfig(\n", + " r=4, # Reduced from 8 to save memory\n", + " lora_alpha=16,\n", + " lora_dropout=0.01,\n", + " bias=\"none\",\n", + " task_type=\"CAUSAL_LM\",\n", + " target_modules=\"all-linear\",\n", + ")\n", + "\n", + "# Training configuration with memory optimizations\n", + "training_args = TrainingArguments(\n", + " output_dir=\"./results\",\n", + " # Reduced steps for testing\n", + " max_steps=50, # Reduced from 100 to 
50 to save memory\n", + " per_device_train_batch_size=1,\n", + " per_device_eval_batch_size=1,\n", + " gradient_accumulation_steps=8, # Increased from 4 to 8 to reduce memory pressure\n", + " optim=\"adamw_torch_fused\",\n", + " # More frequent saving and logging\n", + " save_steps=25,\n", + " logging_steps=5,\n", + " learning_rate=3e-4,\n", + " weight_decay=0.001,\n", + " fp16=False,\n", + " bf16=True,\n", + " max_grad_norm=0.5,\n", + " warmup_ratio=0.02,\n", + " lr_scheduler_type=\"cosine\",\n", + " gradient_checkpointing=True,\n", + " report_to=\"none\",\n", + " # Enable memory optimization options\n", + " deepspeed=None, # Not using DeepSpeed but enabling other memory optimizations\n", + " optim_target_modules=[\"c_attn\", \"c_proj\"], # Optimize specific modules\n", + " # Add auto memory optimization flag\n", + " auto_find_batch_size=True # Automatically find the largest batch size that fits in memory\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ffe93234-2a1a-4809-a431-efe2f35ce496", + "metadata": {}, + "source": [ + "## Load and Prepare Model" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "8ab371f1-64c3-4225-82e7-fb3c5b05578c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|██████████| 6/6 [00:04<00:00, 1.31it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "trainable params: 13,926,400 || all params: 14,673,433,600 || trainable%: 0.0949\n" + ] + } + ], + "source": [ + "# Load tokenizer\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n", + "tokenizer.pad_token = tokenizer.eos_token\n", + "tokenizer.padding_side = \"right\"\n", + "\n", + "# Load model with quantization\n", + "model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " quantization_config=bnb_config,\n", + " device_map=\"auto\",\n", + " trust_remote_code=True\n", + ")\n", + "\n", + "# Prepare 
model for k-bit training\n", + "model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)\n", + "\n", + "# Apply LoRA\n", + "model = get_peft_model(model, peft_config)\n", + "model.print_trainable_parameters()" + ] + }, + { + "cell_type": "markdown", + "id": "dd058fff-f6dd-4cc6-acaf-7e2fa2c1132d", + "metadata": {}, + "source": [ + "## Load and Prepare Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "4392ddab-10b7-41f6-a8e0-65ba298ea457", + "metadata": {}, + "outputs": [], + "source": [ + "def format_prompt(example):\n", + " if example[\"input\"]:\n", + " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n", + "\n", + "### Instruction:\n", + "{example['instruction']}\n", + "\n", + "### Input:\n", + "{example['input']}\n", + "\n", + "### Response:\n", + "{example['output']}\"\"\"\n", + " else:\n", + " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n", + "\n", + "### Instruction:\n", + "{example['instruction']}\n", + "\n", + "### Response:\n", + "{example['output']}\"\"\"\n", + "\n", + "# Load dataset\n", + "dataset = load_dataset(\"json\", data_files=dataset_name, split=\"train\", num_proc=4)\n", + "dataset = dataset.map(lambda x: {\"text\": format_prompt(x)}, num_proc=4)\n", + "\n", + "# Split dataset\n", + "dataset = dataset.train_test_split(test_size=0.1)\n", + "train_dataset = dataset[\"train\"]\n", + "eval_dataset = dataset[\"test\"]" + ] + }, + { + "cell_type": "markdown", + "id": "812cfcc8-33ec-4a2b-8a74-27bfc2a41d7b", + "metadata": {}, + "source": [ + "## Enhanced Training with SFTTrainer" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6dc85c57-68b2-4514-9373-43e3d7c05c10", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': dataset_text_field, max_seq_length. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "Map: 100%|██████████| 8927/8927 [00:02<00:00, 3310.91 examples/s]\n", + "Map: 100%|██████████| 992/992 [00:00<00:00, 3267.10 examples/s]\n", + "max_steps is given, it will override any value given in num_train_epochs\n" + ] + } + ], + "source": [ + "# Create SFTTrainer\n", + "trainer = SFTTrainer(\n", + " model=model,\n", + " train_dataset=train_dataset,\n", + " eval_dataset=eval_dataset,\n", + " peft_config=peft_config,\n", + " dataset_text_field=\"text\",\n", + " max_seq_length=512, # Reduced from 1024 to save memory\n", + " tokenizer=tokenizer,\n", + " args=training_args\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "810eb75e", + "metadata": {}, + "source": [ + "## Federated Averaging Function" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "58298e8e-ab9e-4377-966e-143823441697", + "metadata": {}, + "outputs": [], + "source": [ + "def FedAvg(peft_params, model, weights=None):\n", + " \"\"\"\n", + " Perform Federated Averaging (FedAvg) on the model parameters.\n", + "\n", + " Parameters:\n", + " peft_params (list): A list of 
state dictionaries containing the model parameters from different clients.\n", + " model (torch.nn.Module): The model to which the averaged parameters will be applied.\n", + " weights (list, optional): A list of weights for averaging the parameters. If None, equal weights are used.\n", + "\n", + " Returns:\n", + " torch.nn.Module: The model with the averaged parameters applied.\n", + " \"\"\"\n", + " state_dicts = peft_params\n", + " state_dict = get_peft_model_state_dict(model)\n", + " for key in peft_params[0]:\n", + " dtype = state_dicts[0][key].dtype\n", + " state_dict[key] = torch.from_numpy(\n", + " np.average(\n", + " [state[key].cpu().to(torch.float).numpy() for state in state_dicts], axis=0, weights=weights\n", + " )\n", + " ).to(dtype)\n", + " set_peft_model_state_dict(model, state_dict)\n", + " return model" + ] + }, + { + "cell_type": "markdown", + "id": "e120a656-f4a5-47a5-a3d4-62c5f3672bba", + "metadata": {}, + "source": [ + "## Federated Learning Workflow" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e5e108c6-5150-4931-9c01-6b64a913fa04", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Aggregator step \"start\" registered\n", + "Collaborator step \"aggregated_model_validation\" registered\n", + "Collaborator step \"train\" registered\n", + "Collaborator step \"local_model_validation\" registered\n", + "Aggregator step \"join\" registered\n", + "Aggregator step \"end\" registered\n" + ] + } + ], + "source": [ + "# Import the required PrinterCallback for proper initialization/removal\n", + "from transformers.trainer_callback import PrinterCallback\n", + "import transformers\n", + "import gc\n", + "import psutil\n", + "import os\n", + "import math\n", + "import time\n", + "\n", + "class FederatedFlow(FLSpec):\n", + " def __init__(self, model=None, optimizer=None, rounds=3, quant_type=\"4bit\", **kwargs):\n", + " \"\"\"\n", + " Initialize the class with the given model, 
optimizer, and training parameters.\n", + "\n", + " Parameters:\n", + " model (torch.nn.Module, optional): The model to be used. If None, a ValueError is raised.\n", + " optimizer (torch.optim.Optimizer, optional): The optimizer to be used.\n", + " rounds (int, optional): The number of rounds for training or processing (default is 3).\n", + " quant_type (str, optional): Quantization type, either \"4bit\" or \"8bit\".\n", + " **kwargs: Additional keyword arguments to be passed to the superclass initializer.\n", + "\n", + " Raises:\n", + " ValueError: If no model is provided.\n", + " \"\"\"\n", + " super().__init__(**kwargs)\n", + " if model is not None:\n", + " self.model = model\n", + " self.peft_params = get_peft_model_state_dict(self.model)\n", + " self.optimizer = optimizer\n", + " else:\n", + " raise ValueError(\"No model inputted\")\n", + "\n", + " self.rounds = rounds\n", + " self.quant_type = quant_type\n", + " # Initialize histories for tracking metrics over rounds\n", + " self.average_loss_history = []\n", + " self.agg_model_loss_history = []\n", + " self.local_model_loss_history = []\n", + " # Dictionary to store optimizer states for each collaborator\n", + " self.optimizer_states = {}\n", + " \n", + "\n", + " @aggregator\n", + " def start(self):\n", + " \"\"\"\n", + " Initialize the model and set up the collaborators for federated learning.\n", + "\n", + " This method performs the initial setup for the model, including setting the\n", + " collaborators, initializing private variables, and starting the first round\n", + " of the federated learning process.\n", + " \"\"\"\n", + " print(f\"Performing initialization for model with {self.quant_type} quantization\")\n", + " print(f\"Using {self.rounds} main rounds with partial round updates\")\n", + " self.collaborators = self.runtime.collaborators\n", + " self.current_round = 0\n", + " self.current_sub_round = 0\n", + " # Initialize dictionary to collect memory stats\n", + " # Check if collaborators are 
objects with name attribute or strings\n", + " if hasattr(self.collaborators[0], 'name'):\n", + " collab_names = [c.name for c in self.collaborators]\n", + " else:\n", + " # If collaborators are already strings, use them directly\n", + " collab_names = self.collaborators\n", + " self.all_memory_stats = {collab: {} for collab in collab_names}\n", + " # Initialize optimizer states dictionary for each collaborator\n", + " self.optimizer_states = {collab: None for collab in collab_names}\n", + " self.next(\n", + " self.aggregated_model_validation,\n", + " foreach=\"collaborators\",\n", + " )\n", + "\n", + " \n", + " @collaborator\n", + " def aggregated_model_validation(self):\n", + " \"\"\"\n", + " Perform aggregated model validation for a collaborator.\n", + "\n", + " This method loads the model, applies the PEFT configuration, and evaluates\n", + " the model using the provided training and evaluation datasets. The validation\n", + " score is then stored and the next step in the process is triggered.\n", + " \"\"\"\n", + " print(f\"[Round {self.current_round}, Update {self.current_sub_round}] Performing aggregated model validation for collaborator {self.input} with {self.quant_type}\")\n", + " # Initialize memory tracker for this collaborator\n", + " self.memory_tracker = MemoryTracker(self.input, self.quant_type)\n", + " self.memory_tracker.reset_peak()\n", + " \n", + " # Choose quantization config based on quant_type\n", + " if self.quant_type == \"4bit\":\n", + " quant_config = bnb_config_4bit\n", + " else: # 8bit\n", + " quant_config = bnb_config_8bit\n", + " \n", + " # Define device_map variable\n", + " device_map = {\"\": torch.cuda.current_device()} if torch.cuda.is_available() else \"cpu\"\n", + " try:\n", + " self.model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " quantization_config=quant_config,\n", + " device_map=device_map,\n", + " trust_remote_code=True\n", + " )\n", + " self.memory_tracker.log(\"model_load\")\n", + " except 
ValueError:\n", + " # Fallback to CPU if GPU memory is insufficient\n", + " print(f\"Falling back to CPU mode for {self.input}\")\n", + " self.model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " device_map=\"cpu\",\n", + " trust_remote_code=True\n", + " )\n", + " self.memory_tracker.log(\"model_load\")\n", + " \n", + " self.model = prepare_model_for_kbit_training(self.model)\n", + " self.model = get_peft_model(self.model, peft_config)\n", + " set_peft_model_state_dict(self.model, self.peft_params)\n", + " \n", + " # Use fixed number of steps (max_steps) for each round\n", + " steps_per_round = training_args.max_steps # Use the hardcoded 100 steps\n", + " \n", + " # Create a custom TrainingArguments for this round\n", + " self.round_args = TrainingArguments(\n", + " output_dir=training_args.output_dir,\n", + " max_steps=steps_per_round, # Use fixed steps per round\n", + " per_device_train_batch_size=training_args.per_device_train_batch_size,\n", + " gradient_accumulation_steps=training_args.gradient_accumulation_steps,\n", + " optim=training_args.optim,\n", + " save_steps=steps_per_round // 2 or 1, # More frequent saving\n", + " logging_steps=5,\n", + " learning_rate=training_args.learning_rate,\n", + " weight_decay=training_args.weight_decay,\n", + " fp16=training_args.fp16,\n", + " bf16=training_args.bf16,\n", + " max_grad_norm=training_args.max_grad_norm,\n", + " warmup_ratio=training_args.warmup_ratio,\n", + " lr_scheduler_type=training_args.lr_scheduler_type,\n", + " gradient_checkpointing=training_args.gradient_checkpointing,\n", + " report_to=training_args.report_to\n", + " )\n", + " \n", + " print(f\"[{self.input}] Training with {steps_per_round} steps\")\n", + " \n", + " trainer = SFTTrainer(\n", + " model=self.model,\n", + " args=self.round_args, # Use round specific args\n", + " peft_config=peft_config,\n", + " train_dataset=self.train_dataset,\n", + " eval_dataset=self.eval_dataset,\n", + " max_seq_length=1024,\n", + " 
dataset_text_field=\"text\",\n", + " tokenizer=tokenizer,\n", + " packing=True,\n", + " data_collator=transformers.DataCollatorForSeq2Seq(\n", + " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n", + " ),\n", + " )\n", + "\n", + " trainer.remove_callback(PrinterCallback)\n", + " out = trainer.evaluate()\n", + " self.agg_validation_score = out[\"eval_loss\"]\n", + " print(f\"{self.input} evaluation loss: {self.agg_validation_score}\")\n", + " self.memory_tracker.log_loss(eval_loss=self.agg_validation_score) # Log eval loss\n", + " self.memory_tracker.update_peak()\n", + " self.next(self.train)\n", + "\n", + " @collaborator\n", + " def train(self):\n", + " \"\"\"\n", + " Train the model for a collaborator with partial epoch updates.\n", + "\n", + " This method trains the model using the provided training dataset,\n", + " but processes it in smaller chunks (partial epochs) to allow more\n", + " frequent parameter sharing between collaborators.\n", + " \"\"\"\n", + " self.memory_tracker.log(\"before_training\")\n", + " \n", + " # Reduce steps for 8-bit quantization\n", + " if self.quant_type == \"8bit\":\n", + " max_steps = training_args.max_steps // 2 # Half the steps for 8-bit\n", + " else:\n", + " max_steps = training_args.max_steps\n", + " \n", + " # Define partial training args\n", + " self.sub_round_args = TrainingArguments(\n", + " output_dir=training_args.output_dir,\n", + " max_steps=max_steps,\n", + " per_device_train_batch_size=training_args.per_device_train_batch_size,\n", + " gradient_accumulation_steps=training_args.gradient_accumulation_steps,\n", + " optim=training_args.optim,\n", + " save_steps=max_steps // 4,\n", + " logging_steps=2,\n", + " learning_rate=training_args.learning_rate,\n", + " weight_decay=training_args.weight_decay,\n", + " fp16=training_args.fp16,\n", + " bf16=training_args.bf16,\n", + " max_grad_norm=training_args.max_grad_norm,\n", + " warmup_ratio=training_args.warmup_ratio,\n", + " 
lr_scheduler_type=training_args.lr_scheduler_type,\n", + " gradient_checkpointing=training_args.gradient_checkpointing,\n", + " report_to=training_args.report_to,\n", + " auto_find_batch_size=True # Add auto batch size finding\n", + " )\n", + " \n", + " # Create trainer instance with our custom training args\n", + " trainer = SFTTrainer(\n", + " model=self.model,\n", + " args=self.sub_round_args,\n", + " peft_config=peft_config,\n", + " train_dataset=self.train_dataset,\n", + " eval_dataset=self.eval_dataset,\n", + " max_seq_length=512, # Reduced sequence length\n", + " dataset_text_field=\"text\",\n", + " tokenizer=tokenizer,\n", + " )\n", + " \n", + " # Make sure optimizer is initialized before training\n", + " if not hasattr(trainer, 'optimizer') or trainer.optimizer is None:\n", + " trainer.create_optimizer_and_scheduler(num_training_steps=self.sub_round_args.max_steps)\n", + " if trainer.optimizer is None:\n", + " print(f\"[{self.input}] Warning: Failed to create optimizer. Creating standard optimizer.\")\n", + " # Create a simple optimizer if trainer.create_optimizer_and_scheduler failed\n", + " from torch.optim import AdamW\n", + " trainer.optimizer = AdamW(\n", + " trainer.model.parameters(),\n", + " lr=self.sub_round_args.learning_rate,\n", + " weight_decay=self.sub_round_args.weight_decay\n", + " )\n", + " # Create a simple scheduler\n", + " from transformers import get_scheduler\n", + " trainer.lr_scheduler = get_scheduler(\n", + " name=self.sub_round_args.lr_scheduler_type,\n", + " optimizer=trainer.optimizer,\n", + " num_warmup_steps=int(self.sub_round_args.max_steps * self.sub_round_args.warmup_ratio),\n", + " num_training_steps=self.sub_round_args.max_steps,\n", + " )\n", + " \n", + " # Restore optimizer state if available from previous rounds\n", + " if self.optimizer_states.get(self.input) is not None:\n", + " print(f\"[{self.input}] Restoring optimizer state\")\n", + " try:\n", + " # Load the optimizer state\n", + " 
trainer.optimizer.load_state_dict(self.optimizer_states[self.input])\n", + " except Exception as e:\n", + " print(f\"Failed to restore optimizer state: {e}\")\n", + " \n", + " # For 8-bit quantization with limited GPU memory, use simplified training\n", + " if self.quant_type == \"8bit\":\n", + " # Simplify training for 8-bit\n", + " try:\n", + " # Use trainer.train() for simpler training flow\n", + " trainer.train(resume_from_checkpoint=False)\n", + " # Get the last loss\n", + " self.loss = trainer.state.log_history[-1].get('loss', float('inf'))\n", + " except Exception as e:\n", + " print(f\"Training failed with error: {str(e)}\")\n", + " self.loss = float('inf') # Set to infinity to indicate failure\n", + " else:\n", + " # Regular training with manual control for 4-bit\n", + " print(f\"[{self.input}] Starting partial epoch training with {max_steps} steps\")\n", + " trainer.model.train()\n", + " total_loss = 0\n", + " step_count = 0\n", + " \n", + " # Set up dataloader for manual batching\n", + " dataloader = trainer.get_train_dataloader()\n", + " \n", + " # Process batches manually for more control\n", + " for step, inputs in enumerate(dataloader):\n", + " # Move inputs to the appropriate device\n", + " inputs = {k: v.to(trainer.args.device) for k, v in inputs.items()}\n", + " \n", + " # Forward pass\n", + " outputs = trainer.model(**inputs)\n", + " \n", + " # Handle different output formats\n", + " if isinstance(outputs, dict):\n", + " if \"loss\" in outputs:\n", + " loss = outputs[\"loss\"] / trainer.args.gradient_accumulation_steps\n", + " else:\n", + " # Calculate loss manually if not provided in outputs\n", + " # Get logits from outputs\n", + " if \"logits\" in outputs:\n", + " logits = outputs[\"logits\"]\n", + " # Get labels from inputs\n", + " labels = inputs.get(\"labels\")\n", + " if labels is not None:\n", + " # Calculate loss using cross-entropy\n", + " import torch.nn.functional as F\n", + " # Shift logits and labels for causal LM\n", + " 
shift_logits = logits[..., :-1, :].contiguous()\n", + " shift_labels = labels[..., 1:].contiguous()\n", + " loss = F.cross_entropy(\n", + " shift_logits.view(-1, shift_logits.size(-1)),\n", + " shift_labels.view(-1),\n", + " ignore_index=-100\n", + " ) / trainer.args.gradient_accumulation_steps\n", + " else:\n", + " print(f\"Warning: No labels in inputs, using dummy loss\")\n", + " loss = (outputs[\"logits\"].sum() * 0.0) / trainer.args.gradient_accumulation_steps\n", + " else:\n", + " print(f\"Warning: No logits or loss in outputs, using dummy loss\")\n", + " # Use any tensor from outputs for a dummy loss\n", + " dummy_tensor = next(iter(outputs.values()))\n", + " loss = (dummy_tensor.sum() * 0.0) / trainer.args.gradient_accumulation_steps\n", + " else:\n", + " loss = outputs.loss / trainer.args.gradient_accumulation_steps\n", + " \n", + " total_loss += loss.detach().float()\n", + " \n", + " # Backward pass\n", + " loss.backward()\n", + " \n", + " # Update weights on gradient accumulation steps or at the end\n", + " if ((step + 1) % trainer.args.gradient_accumulation_steps == 0) or (step == len(dataloader) - 1):\n", + " # Double check optimizer exists before using it\n", + " if trainer.optimizer is None:\n", + " print(f\"[{self.input}] Warning: Optimizer is None at step {step}. 
Creating optimizer.\")\n", + " # Create a simple optimizer\n", + " from torch.optim import AdamW\n", + " trainer.optimizer = AdamW(\n", + " trainer.model.parameters(), \n", + " lr=self.sub_round_args.learning_rate,\n", + " weight_decay=self.sub_round_args.weight_decay\n", + " )\n", + " # Create a simple scheduler\n", + " from transformers import get_scheduler\n", + " trainer.lr_scheduler = get_scheduler(\n", + " name=self.sub_round_args.lr_scheduler_type,\n", + " optimizer=trainer.optimizer,\n", + " num_warmup_steps=int(self.sub_round_args.max_steps * self.sub_round_args.warmup_ratio),\n", + " num_training_steps=self.sub_round_args.max_steps,\n", + " )\n", + " \n", + " trainer.optimizer.step()\n", + " trainer.lr_scheduler.step()\n", + " trainer.optimizer.zero_grad()\n", + " step_count += 1\n", + " \n", + " # Log progress\n", + " if step_count > 0 and step_count % 10 == 0:\n", + " print(f\"[{self.input}] Completed {step_count} steps, current loss: {total_loss/step_count:.4f}\")\n", + " \n", + " # Stop after max_steps\n", + " if step_count >= max_steps:\n", + " break\n", + " \n", + " # Calculate final training loss\n", + " self.loss = total_loss / step_count if step_count > 0 else 0\n", + " \n", + " print(f\"[{self.input}] Training completed, average loss: {self.loss:.4f}\")\n", + " \n", + " # Log memory and training metrics\n", + " self.memory_tracker.log(\"after_training\")\n", + " self.memory_tracker.log_loss(training_loss=self.loss)\n", + " self.memory_tracker.update_peak()\n", + " \n", + " # Save optimizer state for next round\n", + " if hasattr(trainer, 'optimizer') and trainer.optimizer is not None:\n", + " self.optimizer_states[self.input] = trainer.optimizer.state_dict()\n", + " # Create directory for saving optimizer state if needed\n", + " os.makedirs(f\"./optimizer_state/{self.input}\", exist_ok=True)\n", + " # Save optimizer state to disk as backup\n", + " torch.save(\n", + " trainer.optimizer.state_dict(), \n", + " 
@collaborator
def local_model_validation(self):
    """
    Evaluate the collaborator's locally trained model, then transition to `join`.

    An `SFTTrainer` is built around `self.model` with the sub-round arguments and
    `evaluate()` is run on it. The reported `eval_loss` is stored in
    `self.local_validation_score` and logged to the memory tracker. The PEFT
    state dict of the model is captured in `self.peft_params`, the memory report
    is printed, and the tracker's stats are snapshotted into `self.memory_stats`.

    `training_completed`, `model` and `memory_tracker` are excluded from the
    state handed to `join`.
    """
    # Build the collator first so the trainer call below is a flat list of arguments.
    padding_collator = transformers.DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    )
    eval_trainer = SFTTrainer(
        model=self.model,
        args=self.sub_round_args,  # sub-round specific training arguments
        peft_config=peft_config,
        train_dataset=self.train_dataset,
        eval_dataset=self.eval_dataset,
        max_seq_length=1024,
        dataset_text_field="text",
        tokenizer=tokenizer,
        packing=True,
        data_collator=padding_collator,
    )

    eval_metrics = eval_trainer.evaluate()
    self.local_validation_score = eval_metrics["eval_loss"]
    print(f"[{self.input}] Local evaluation loss: {self.local_validation_score}")
    # Record the eval loss next to the memory figures for this collaborator.
    self.memory_tracker.log_loss(eval_loss=self.local_validation_score)

    # Adapter weights are what the aggregator averages in `join`.
    self.peft_params = get_peft_model_state_dict(self.model)
    print(f"Doing local model validation for collaborator {self.input}")

    # Print the per-collaborator memory report and keep a snapshot of the stats.
    self.memory_tracker.report()
    self.memory_stats = self.memory_tracker.get_stats()

    self.next(self.join, exclude=["training_completed", "model", "memory_tracker"])
@aggregator
def join(self, inputs):
    """
    Aggregate the collaborators' results for one partial update and advance the flow.

    Steps, in order:
      1. Average `loss`, `agg_validation_score` and `local_validation_score`
         over `inputs`. The `aggregated_model_accuracy` and
         `local_model_accuracy` attributes hold these averaged validation
         *losses* (they are printed and stored in the `*_loss_history` lists
         as losses); the attribute names are kept as they are.
      2. Store each collaborator's `memory_stats` under the current
         round/update key and copy over its optimizer state when one is present.
      3. Write the collected optimizer states to `./optimizer_state/aggregator`.
      4. Merge the collaborators' `peft_params` into `self.model` via `FedAvg`
         and save the model and tokenizer under `./aggregated`.
      5. Advance the sub-round counter (two partial updates per round) and
         either start the next pass over the collaborators or go to `end`.

    Args:
        inputs: The collaborator states arriving at this join step.
    """
    collaborator_count = len(inputs)
    self.average_loss = sum(result.loss for result in inputs) / collaborator_count
    self.aggregated_model_accuracy = (
        sum(result.agg_validation_score for result in inputs) / collaborator_count
    )
    self.local_model_accuracy = (
        sum(result.local_validation_score for result in inputs) / collaborator_count
    )
    print(
        f"[Round {self.current_round}, Update {self.current_sub_round}] Average aggregated model validation loss = {self.aggregated_model_accuracy}"
    )
    print(f"[Round {self.current_round}, Update {self.current_sub_round}] Average training loss = {self.average_loss}")
    print(f"[Round {self.current_round}, Update {self.current_sub_round}] Average local model validation loss = {self.local_model_accuracy}")

    # Keep the per-update averages so trends can be plotted later.
    self.average_loss_history.append(self.average_loss)
    self.agg_model_loss_history.append(self.aggregated_model_accuracy)
    self.local_model_loss_history.append(self.local_model_accuracy)

    # The key is the same for every collaborator in this update, so build it once.
    round_key = f"round_{self.current_round}_update_{self.current_sub_round}"
    for result in inputs:
        collab_name = result.input
        self.all_memory_stats[collab_name][round_key] = result.memory_stats
        # Carry the collaborator's optimizer state forward when it reported one.
        if hasattr(result, 'optimizer_states'):
            collab_optimizer_state = result.optimizer_states.get(collab_name)
            if collab_optimizer_state is not None:
                self.optimizer_states[collab_name] = collab_optimizer_state

    # Persist the collected optimizer states for debug/analysis.
    os.makedirs("./optimizer_state/aggregator", exist_ok=True)
    torch.save(
        self.optimizer_states,
        f"./optimizer_state/aggregator/optimizers_round_{self.current_round}_update_{self.current_sub_round}.pt"
    )

    collaborator_adapters = [result.peft_params for result in inputs]
    self.model = FedAvg(collaborator_adapters, self.model)
    self.peft_params = get_peft_model_state_dict(self.model)

    # Save the aggregated model (and tokenizer) after each sub-round update.
    save_dir = f"./aggregated/model_round_{self.current_round}_update_{self.current_sub_round}"
    os.makedirs(save_dir, exist_ok=True)
    self.model.save_pretrained(save_dir)
    tokenizer.save_pretrained(f"./aggregated/tokenizer_round_{self.current_round}_update_{self.current_sub_round}")

    # Each pass through `join` is one partial update; two partial updates make
    # up a full round, after which the main round counter advances.
    self.current_sub_round += 1
    if self.current_sub_round >= 2:
        self.current_sub_round = 0
        self.current_round += 1

    if self.current_round >= self.rounds:
        self.next(self.end)
    else:
        self.next(
            self.aggregated_model_validation,
            foreach="collaborators",
            exclude=["model"],
        )
@aggregator
def end(self):
    """
    Print the final summary of the federated run.

    Output, in order: the last value of each loss history, the full history of
    each loss (one line per update), and the memory statistics collected in
    `self.all_memory_stats` for every collaborator and round/update. Nothing
    is modified; this step only prints.
    """
    print(f"This is the end of the flow for {self.quant_type} quantization")
    print("\n===== Final Metrics =====\n")
    print(f"Average Training Loss: {self.average_loss_history[-1]:.4f}")
    print(f"Final Aggregated Model Loss: {self.agg_model_loss_history[-1]:.4f}")
    print(f"Final Local Model Loss: {self.local_model_loss_history[-1]:.4f}")

    print("\n===== Metric History =====\n")
    # The three histories are printed the same way; only the heading differs.
    history_sections = (
        ("Training Loss History:", self.average_loss_history),
        ("\nAggregated Model Loss History:", self.agg_model_loss_history),
        ("\nLocal Model Loss History:", self.local_model_loss_history),
    )
    for heading, history in history_sections:
        print(heading)
        for i, loss in enumerate(history):
            print(f" Update {i}: {loss:.4f}")

    print("\n===== Memory Usage Summary Across All Rounds =====\n")

    loss_metrics = ('training_loss', 'eval_loss')
    for collab, rounds_data in self.all_memory_stats.items():
        print(f"\n==== {collab} Memory Usage Across Rounds/Updates ({self.quant_type}) ====\n")
        for round_name, stats in rounds_data.items():
            print(f" {round_name}:")
            for metric, value in stats.items():
                if value is None:
                    print(f" {metric}: Not recorded")
                    continue
                if metric == 'quant_type':
                    # Not a numeric metric; the quantization type is already
                    # part of the section header printed above.
                    continue
                if metric in loss_metrics:
                    print(f" {metric}: {value:.4f}")
                else:
                    # Every remaining metric is printed as a size in MB.
                    print(f" {metric}: {value:.2f} MB")
        # NOTE(review): separator nesting reconstructed from whitespace-collapsed
        # source; assumed to be one separator per collaborator - confirm.
        print("-" * 50)
quantization\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mUsing 5 main rounds with partial round updates\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 0, Update 0] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " 
warnings.warn(\n", + "Generating train split: 912 examples [00:01, 622.96 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "Generating train split: 103 examples [00:00, 627.49 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.5811071991920471\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "Map: 100%|##########| 4464/4464 [00:01<00:00, 3487.12 examples/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "Map: 100%|##########| 496/496 [00:00<00:00, 3341.29 examples/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n", + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.5313\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.5340\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.5390\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.5433\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.5464\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.5491\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.5525\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.5587\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.4287\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.4316\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.4328\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.4357\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.4377\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.4393\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.4421\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.4445\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.3906\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.3917\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.3931\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.3945\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.3967\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.3976\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.3993\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.4006\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.3724\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.3728\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.3738\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.3752\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.3763\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.3773\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.3783\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.3791\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.3572\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.3572\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: 
Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "Generating train split: 912 examples [00:01, 618.15 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "Generating train split: 103 examples [00:00, 632.61 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.3916991055011749\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 32696.12 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57374.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 54745.21 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 30342.61 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 41548.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 43451.02 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 32388.53 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57268.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 54745.21 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 32696.12 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57374.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 54745.21 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.3572\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.3917\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 0, Update 0] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "Generating train split: 912 examples [00:01, 631.44 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "Generating train split: 101 
examples [00:00, 625.21 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.5805659294128418\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "Map: 100%|##########| 4463/4463 [00:01<00:00, 3422.90 examples/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "Map: 100%|##########| 496/496 [00:00<00:00, 3404.21 examples/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.5439\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.5469\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.5501\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.5523\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.5550\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.5580\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.5638\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.5660\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.4307\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.4329\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.4347\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.4358\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.4375\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.4432\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.4451\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.4487\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 
30 steps, current loss: 0.3982\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.3988\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.3996\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.4012\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.4025\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.4033\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.4041\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.4052\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.3717\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.3723\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.3728\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.3741\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.3749\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.3755\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.3769\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.3778\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.3601\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.3601\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, 
dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "Generating train split: 912 examples [00:01, 639.46 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.3990606665611267\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 21814.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 47222.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 54883.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 19625.16 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 30754.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 54883.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 21639.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 47114.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 54883.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 21814.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 47222.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 54883.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.3601\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.3991\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling join\n", + "\u001b[94m[Round 0, Update 0] Average aggregated model validation loss = 0.5808365643024445\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 0, Update 0] Average training loss = 0.35865288972854614\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 0, Update 0] Average local model validation loss = 0.3953798860311508\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 0, Update 1] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "Generating train split: 912 examples [00:01, 650.05 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.38848692178726196\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "Map: 100%|##########| 496/496 [00:00<00:00, 3328.62 examples/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. 
Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2647\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2668\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2703\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2734\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2761\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2782\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2813\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2857\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2647\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2676\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2687\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2708\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2726\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2740\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2758\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2780\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 
0.2695\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2704\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2717\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2729\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2748\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2756\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2772\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2783\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2720\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2724\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2732\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2745\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2752\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2762\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2770\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2778\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.2645\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.2645\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, 
dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "Generating train split: 103 examples [00:00, 645.41 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.3903903663158417\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33547.22 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57726.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31364.12 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31586.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 54883.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33378.03 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57670.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33547.22 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57726.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.2645\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.3904\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 0, Update 1] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.3931271433830261\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "Map: 100%|##########| 496/496 [00:00<00:00, 3406.01 examples/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. 
Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2763\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2785\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2815\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2831\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2853\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2878\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2919\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2938\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2690\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2709\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2725\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2735\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2749\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2798\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2815\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2844\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 
0.2730\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2736\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2743\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2757\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2770\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2778\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2785\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2795\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2696\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2701\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2707\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2717\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2725\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2730\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2740\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2748\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.2659\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.2659\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, 
packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "Generating train split: 101 examples [00:00, 653.25 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.3981446623802185\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33447.23 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57866.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31364.12 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31506.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33378.03 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57810.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33447.23 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57866.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.2659\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.3981\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling join\n", + "\u001b[94m[Round 0, Update 1] Average aggregated model validation loss = 0.39080703258514404\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 0, Update 1] Average training loss = 0.2652348279953003\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 0, Update 1] Average local model validation loss = 0.3942675143480301\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 1, Update 0] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.23it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " 
warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.38319262862205505\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Completed 10 steps, current loss: 0.2195\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2213\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2243\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2270\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2291\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2310\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2341\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2371\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2230\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2253\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2263\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2280\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2298\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2311\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2324\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2346\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2331\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2338\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2351\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2361\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2378\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2385\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2398\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2408\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2375\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2379\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2386\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2398\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2404\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2413\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2421\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2428\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.2315\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.2315\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.392806738615036\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.76 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58286.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.66 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31766.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.57 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58230.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.76 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58286.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.2315\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.3928\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 1, Update 0] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.3884606957435608\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2279\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2298\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2329\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2345\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2363\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2382\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2412\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2428\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2262\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2280\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2295\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2305\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2317\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2359\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2375\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2394\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2341\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 
steps, current loss: 0.2347\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2354\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2367\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2378\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2386\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2392\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2402\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2352\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2356\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2361\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2369\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2376\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2381\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2390\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2397\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.2306\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.2306\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.4016312062740326\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58166.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.66 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31784.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.57 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58110.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58166.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.2306\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4016\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling join\n", + "\u001b[94m[Round 1, Update 0] Average aggregated model validation loss = 0.3858266621828079\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 1, Update 0] Average training loss = 0.23103156685829163\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 1, Update 0] Average local model validation loss = 0.3972189724445343\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 1, Update 1] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " 
warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.38549479842185974\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Completed 10 steps, current loss: 0.1714\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1729\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1755\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1777\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1793\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1809\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1836\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1854\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1804\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1824\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1834\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1847\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1863\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1876\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1888\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1907\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1972\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1979\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1991\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2000\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2014\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2021\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2032\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2043\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2048\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2052\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2057\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2068\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2073\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2081\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2088\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2095\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.1995\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.1995\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.401896208524704\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.76 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58226.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.66 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31748.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.58 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58170.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.76 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58226.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.1995\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4019\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 1, Update 1] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.3921719491481781\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1773\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1793\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1823\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1835\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1851\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1866\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1891\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1905\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1832\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1849\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1863\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1870\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1881\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1913\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1927\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1940\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1960\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 
steps, current loss: 0.1966\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1973\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1984\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1993\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2001\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2007\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2017\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1996\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2001\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2005\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2011\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2018\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2023\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2031\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2037\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.1968\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.1968\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.413899302482605\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57886.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.66 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31770.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.58 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57830.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57886.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.1968\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4139\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling join\n", + "\u001b[94m[Round 1, Update 1] Average aggregated model validation loss = 0.3888333737850189\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 1, Update 1] Average training loss = 0.19815459847450256\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 1, Update 1] Average local model validation loss = 0.4078977555036545\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 2, Update 0] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " 
warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.39401885867118835\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Completed 10 steps, current loss: 0.1237\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1246\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1261\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1270\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1279\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1287\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1301\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1316\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1355\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1373\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1381\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1395\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1409\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1420\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1430\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1444\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1612\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1619\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1628\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1637\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1649\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1656\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1666\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1675\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1733\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1737\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1742\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1751\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1756\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1762\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1768\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1775\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.1710\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.1710\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.41375958919525146\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58186.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.67 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31838.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58130.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58186.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.1710\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4138\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 2, Update 0] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.40143540501594543\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1366\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1386\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1411\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1418\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1435\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1452\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1468\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1478\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1464\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1478\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1488\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1494\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1505\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1524\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1537\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1551\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1653\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 
steps, current loss: 0.1658\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1664\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1673\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1681\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1689\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1695\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1704\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1743\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1748\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1751\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1757\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1764\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1768\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1775\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1780\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.1720\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.1720\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.42323189973831177\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58206.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.67 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31798.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58150.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58206.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.1720\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4232\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling join\n", + "\u001b[94m[Round 2, Update 0] Average aggregated model validation loss = 0.3977271318435669\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 2, Update 0] Average training loss = 0.17152628302574158\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 2, Update 0] Average local model validation loss = 0.4184957444667816\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 2, Update 1] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " 
warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.40234553813934326\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Completed 10 steps, current loss: 0.0984\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0989\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0996\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1004\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1011\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1018\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1027\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1035\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.1473\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.1473\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.4498145878314972\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57986.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31762.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57930.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57986.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.1473\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4498\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 2, Update 1] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.41048040986061096\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1032\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1041\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1049\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1058\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1065\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1075\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1085\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1092\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1116\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1130\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1140\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1148\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1160\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1172\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1186\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1195\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1353\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 
steps, current loss: 0.1358\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1363\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1370\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1377\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1384\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1390\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1397\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1460\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1463\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1467\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1472\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1478\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1482\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1489\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1493\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.1441\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.1441\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.44478899240493774\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58266.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31822.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58210.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58266.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.1441\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4448\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling join\n", + "\u001b[94m[Round 2, Update 1] Average aggregated model validation loss = 0.4064129739999771\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 2, Update 1] Average training loss = 0.14573591947555542\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 2, Update 1] Average local model validation loss = 0.44730179011821747\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 3, Update 0] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " 
warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.4368877112865448\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Completed 10 steps, current loss: 0.0855\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0863\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0871\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0882\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0889\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0895\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0902\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0911\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0842\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0857\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0862\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0868\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0874\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0882\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0887\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0894\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1078\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1084\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1092\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1099\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1108\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1113\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1121\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1129\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1229\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1232\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1235\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1241\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1245\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1249\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1254\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1259\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.1210\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.1210\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.4723173975944519\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58146.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31782.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.60 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58090.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58146.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.1210\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4723\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 3, Update 0] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.44539061188697815\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0859\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0867\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0880\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0891\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0899\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0907\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0927\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0936\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0927\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0941\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0948\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0954\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0963\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0970\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0981\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0987\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1174\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 
steps, current loss: 0.1178\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1183\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1188\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1197\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1203\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1210\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1218\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1301\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1304\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1308\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1312\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1317\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1321\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1326\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1330\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.1277\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.1277\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.4785761833190918\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58106.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31914.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.60 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58050.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58106.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.1277\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4786\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling join\n", + "\u001b[94m[Round 3, Update 0] Average aggregated model validation loss = 0.4411391615867615\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 3, Update 0] Average training loss = 0.12434957921504974\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 3, Update 0] Average local model validation loss = 0.47544679045677185\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 3, Update 1] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " 
warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.47256991267204285\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Completed 10 steps, current loss: 0.0721\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0727\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0735\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0744\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0752\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0759\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0768\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0777\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0733\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0750\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0756\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0761\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0766\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0771\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0780\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0786\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.1022\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.1022\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.5270882248878479\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58346.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31804.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.61 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58290.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58346.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.1022\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5271\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 3, Update 1] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.48190006613731384\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0685\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0691\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0700\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0707\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0716\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0723\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0734\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0744\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0731\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0741\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0750\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0754\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0761\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0766\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0772\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0777\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0940\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 
steps, current loss: 0.0945\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0949\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0954\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0960\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0966\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0972\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0978\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1058\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1061\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1064\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1068\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1073\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1076\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1082\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1085\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.1042\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.1042\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.528550386428833\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.80 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58384.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31838.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.61 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58328.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.80 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58384.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.1042\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5286\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling join\n", + "\u001b[94m[Round 3, Update 1] Average aggregated model validation loss = 0.47723498940467834\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 3, Update 1] Average training loss = 0.10317578911781311\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 3, Update 1] Average local model validation loss = 0.5278193056583405\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 4, Update 0] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " 
warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Completed 30 steps, current loss: 0.0727\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0730\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0734\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0739\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0744\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0748\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0754\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] 
Completed 30 steps, current loss: 0.0759\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0824\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0826\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0828\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0831\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0832\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0835\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0838\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0841\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.0805\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.0805\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.5606555938720703\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.80 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58806.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.70 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31916.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58748.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.80 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58806.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.0805\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5607\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 4, Update 0] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.5261728763580322\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0576\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0584\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0589\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0594\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0602\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0610\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0617\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0622\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0613\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0622\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0627\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0631\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0637\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0642\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0649\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0653\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0763\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 
steps, current loss: 0.0766\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0770\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0773\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0778\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0783\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0788\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0791\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0871\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0873\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0875\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0878\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0882\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0885\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0891\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0893\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.0857\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.0857\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.5728362202644348\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58444.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.70 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 32006.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58388.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58444.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.0857\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5728\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling join\n", + "\u001b[94m[Round 4, Update 0] Average aggregated model validation loss = 0.5190570056438446\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 4, Update 0] Average training loss = 0.08311298489570618\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 4, Update 0] Average local model validation loss = 0.5667459070682526\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 4, Update 1] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.26it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " 
warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.5445127487182617\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Completed 10 steps, current loss: 0.0533\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0536\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0543\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0551\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0557\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0563\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0570\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0577\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0569\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0578\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0583\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0587\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0590\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0595\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0600\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0604\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0669\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0672\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0675\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0677\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0682\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0684\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0688\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0694\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0748\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0750\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0752\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0755\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0757\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0759\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0763\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0766\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.0734\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.0734\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.5720435380935669\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57946.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.71 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31786.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57888.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57946.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.0734\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5720\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 4, Update 1] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.562188982963562\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0550\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0556\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0563\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0569\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0575\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0582\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0598\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0602\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0583\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0590\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0594\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0597\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0605\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0610\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0614\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0619\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0678\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 
steps, current loss: 0.0680\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0683\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0686\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0689\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0692\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0696\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0699\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0732\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0734\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0736\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0738\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0741\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0743\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0746\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0748\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.0720\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.0720\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.6029046773910522\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.82 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58264.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.71 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31886.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58208.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.82 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58264.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.0720\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.6029\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling join\n", + "\u001b[94m[Round 4, Update 1] Average aggregated model validation loss = 0.5533508658409119\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 4, Update 1] Average training loss = 0.07266537845134735\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 4, Update 1] Average local model validation loss = 0.5874741077423096\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling end\n", + "\u001b[94mThis is the end of the flow for 4bit quantization\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "===== Final Metrics =====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mAverage Training Loss: 0.0727\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mFinal Aggregated Model Loss: 0.5534\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mFinal Local Model Loss: 0.5875\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "===== Metric History =====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mTraining Loss History:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 0: 0.3587\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 1: 0.2652\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 2: 0.2310\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 3: 0.1982\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 4: 0.1715\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 5: 0.1457\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 6: 0.1243\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 7: 0.1032\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 8: 0.0831\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 9: 0.0727\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Aggregated Model Loss History:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 0: 0.5808\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 1: 0.3908\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 2: 0.3858\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 3: 0.3888\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 4: 
0.3977\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 5: 0.4064\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 6: 0.4411\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 7: 0.4772\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 8: 0.5191\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 9: 0.5534\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Local Model Loss History:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 0: 0.3954\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 1: 0.3943\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 2: 0.3972\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 3: 0.4079\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 4: 0.4185\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 5: 0.4473\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 6: 0.4754\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 7: 0.5278\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 8: 0.5667\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 9: 0.5875\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "===== Memory Usage Summary Across All Rounds =====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Portland Memory Usage Across Rounds/Updates (4bit) ====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_0_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 32696.12 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 57374.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 54745.21 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.3572\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.3917\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 30342.61 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 41548.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 43451.02 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m 
before_training_allocated: 32388.53 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 57268.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 54745.21 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 32696.12 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 57374.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 54745.21 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_0_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33547.22 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 57726.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.2645\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.3904\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31364.12 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31586.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 54883.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33378.03 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 57670.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33547.22 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 57726.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_1_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33706.76 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58286.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 
0.2315\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.3928\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.66 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31766.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.57 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58230.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33706.76 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58286.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_1_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33706.76 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58226.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.1995\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4019\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.66 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31748.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.58 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58170.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33706.76 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58226.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 
55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_2_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33706.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58186.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.1710\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4138\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.67 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31838.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58130.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33706.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58186.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_2_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33706.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 57986.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.1473\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4498\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31762.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m 
before_training_reserved: 57930.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33706.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 57986.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_3_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33706.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58146.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.1210\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4723\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31782.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.60 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58090.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33706.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58146.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_3_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33706.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58346.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.1022\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 
0.5271\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31804.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.61 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58290.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33706.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58346.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_4_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33706.80 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58806.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.0805\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5607\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.70 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31916.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58748.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33706.80 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58806.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m 
round_4_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33706.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 57946.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.0734\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5720\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.71 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31786.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 57888.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33706.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 57946.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Seattle Memory Usage Across Rounds/Updates (4bit) ====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_0_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 21814.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 47222.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 54883.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.3601\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.3991\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 19625.16 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 30754.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 
54883.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 21639.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 47114.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 54883.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 21814.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 47222.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 54883.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_0_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33447.23 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 57866.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.2659\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.3981\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31364.12 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31506.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33378.03 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 57810.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33447.23 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 57866.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_1_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33606.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58166.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.26 
MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.2306\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4016\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.66 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31784.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.57 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58110.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33606.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58166.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_1_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33606.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 57886.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.1968\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4139\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.66 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31770.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.58 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 57830.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33606.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 57886.00 
MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_2_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33606.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58206.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.1720\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4232\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.67 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31798.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58150.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33606.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58206.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_2_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33606.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58266.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.1441\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4448\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31822.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m 
before_training_allocated: 33537.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58210.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33606.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58266.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_3_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33606.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58106.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.1277\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4786\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31914.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.60 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58050.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33606.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58106.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_3_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33606.80 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58384.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 
0.1042\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5286\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31838.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.61 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58328.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33606.80 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58384.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_4_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33606.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58444.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.0857\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5728\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.70 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 32006.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58388.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33606.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58444.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 
55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_4_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33606.82 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58264.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.0720\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.6029\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.71 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31886.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58208.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33606.82 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58264.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "# Setup participants\n", + "aggregator = Aggregator()\n", + "collaborators = [\n", + " Collaborator(name=\"Portland\"),\n", + " Collaborator(name=\"Seattle\")\n", + "]\n", + "\n", + "# Assign data shards\n", + "for idx, colab in enumerate(collaborators):\n", + " colab.private_attributes = {\n", + " \"train_dataset\": train_dataset.shard(len(collaborators), idx),\n", + " \"eval_dataset\": eval_dataset.shard(len(collaborators), idx)\n", + " }\n", + "\n", + "# Run with 4-bit quantization\n", + "print(\"\\n=============== Running with 4-bit Quantization ===============\\n\")\n", + "bnb_config = bnb_config_4bit # Set active config to 4-bit\n", 
+ "runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators)\n", + "flflow_4bit = FederatedFlow(model, rounds=5, quant_type=\"4bit\") # Run 5 federated rounds\n", + "flflow_4bit.runtime = runtime\n", + "flflow_4bit.run()" + ] + }, + { + "cell_type": "markdown", + "id": "87c4865a", + "metadata": {}, + "source": [ + "## Run Federated Learning with 8-bit Quantization" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "93c60404", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cleared CUDA cache between runs\n", + "\n", + "=============== Running with 8-bit Quantization ===============\n", + "\n", + "Loading model with 8-bit quantization on CPU first...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|██████████| 6/6 [00:02<00:00, 2.05it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling start\n", + "\u001b[94mPerforming initialization for model with 8bit quantization\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mUsing 5 main rounds with partial round updates\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 0, Update 0] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.27it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.561655580997467\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 
during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.728600
40.559300
60.475600
80.319500
100.310800
120.300300
140.244800
160.375300
180.305800
200.279400
220.359400
240.257100

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.3953164219856262\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39210.98 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 59636.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 56253.51 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 36814.91 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38196.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39210.98 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 59636.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 56253.51 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39008.34 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42136.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 56253.51 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.3953\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 0, Update 0] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.26it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.5638197064399719\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 
during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:30, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.845500
40.502000
60.451700
80.422500
100.285700
120.259400
140.324700
160.300600
180.309400
200.332400
220.378600
240.347400

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You 
passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.40347251296043396\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39317.46 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 59596.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 56364.86 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 36974.44 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38276.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 56364.86 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39317.46 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 59596.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 56364.86 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39149.93 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42176.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 56364.86 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4035\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", 
+ "Calling join\n", + "\u001b[94m[Round 0, Update 0] Average aggregated model validation loss = 0.5627376437187195\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 0, Update 0] Average training loss = inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 0, Update 0] Average local model validation loss = 0.3993944674730301\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 0, Update 1] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + 
"\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.395594984292984\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [22/25 01:14 < 00:11, 0.27 it/s, Epoch 0.04/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.287200
40.248800
60.295900
80.238400
100.245100
120.248000
140.212000
160.306900
180.248200
200.230800

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.39317232370376587\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40105.33 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60902.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57210.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37772.41 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38798.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57164.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40105.33 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60902.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57210.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39841.12 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42898.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57210.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.3932\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", 
+ "Calling join\n", + "\u001b[94m[Round 0, Update 1] Average aggregated model validation loss = 0.3971341550350189\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 0, Update 1] Average training loss = inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 0, Update 1] Average local model validation loss = 0.39462728798389435\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 1, Update 0] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + 
"\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.3897101879119873\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.237800
40.217600
60.244500
80.208000
100.210300
120.213500
140.190500
160.264600
180.209000
200.189600
220.251900
240.158800

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.40585511922836304\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40210.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61118.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57272.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38822.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57214.60 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40210.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61118.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57272.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39916.32 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42894.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57272.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4059\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 1, Update 0] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.3935319185256958\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.268500
40.193500
60.218900
80.268700
100.190400
120.189700
140.214800
160.211600
180.231400
200.240300
220.246100
240.177200

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.40515169501304626\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40215.71 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60930.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57319.97 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38916.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57272.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40215.71 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60930.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57319.97 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39949.17 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42986.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57319.97 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4052\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", 
+ "Calling join\n", + "\u001b[94m[Round 1, Update 0] Average aggregated model validation loss = 0.39162105321884155\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 1, Update 0] Average training loss = inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 1, Update 0] Average local model validation loss = 0.40550340712070465\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 1, Update 1] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + 
"\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.3948669135570526\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.184200
40.176300
60.187500
80.161600
100.176600
120.178600
140.169600
160.213200
180.169800
200.152300
220.184800
240.121900

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.4210392236709595\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40210.16 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60316.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38794.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40210.16 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60316.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39911.82 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42890.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4210\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 1, Update 1] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.39981263875961304\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:30, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.211300
40.154100
60.163600
80.212400
100.163400
120.166300
140.176000
160.181400
180.182900
200.197100
220.201300
240.147700

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.4217308461666107\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40218.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61308.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38902.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40218.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61308.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39946.36 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42924.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4217\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", 
+ "Calling join\n", + "\u001b[94m[Round 1, Update 1] Average aggregated model validation loss = 0.3973397761583328\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 1, Update 1] Average training loss = inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 1, Update 1] Average local model validation loss = 0.4213850349187851\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 2, Update 0] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + 
"\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.40616321563720703\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.128500
40.116900
60.109700
80.119700
100.154000
120.148100
140.156600
160.162500
180.128900
200.127000
220.123400
240.083500

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.5076923370361328\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40203.53 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61136.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38848.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40203.53 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61136.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39909.20 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42830.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5077\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 2, Update 0] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.41194236278533936\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.150600
40.110600
60.114500
80.168300
100.139400
120.133200
140.144500
160.158400
180.144300
200.149400
220.145400
240.108200

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.4701308310031891\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40210.40 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61008.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38938.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40210.40 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61008.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39941.88 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 43064.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4701\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", 
+ "Calling join\n", + "\u001b[94m[Round 2, Update 0] Average aggregated model validation loss = 0.4090527892112732\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 2, Update 0] Average training loss = inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 2, Update 0] Average local model validation loss = 0.48891158401966095\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 2, Update 1] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + 
"\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.4745386242866516\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.077900
40.073000
60.103700
80.090100
100.109100
120.110200
140.128100
160.140200
180.123900
200.105000
220.098300
240.070000

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.5099506974220276\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40192.70 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61154.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38856.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40192.70 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61154.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39908.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42890.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5100\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 2, Update 1] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.48276180028915405\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.086700
40.066700
60.083300
80.140100
100.135800
120.124500
140.128400
160.135900
180.132500
200.125100
220.109400
240.079100

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.5275096893310547\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40199.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60928.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38960.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40199.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60928.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39946.38 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42922.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5275\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", 
+ "Calling join\n", + "\u001b[94m[Round 2, Update 1] Average aggregated model validation loss = 0.47865021228790283\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 2, Update 1] Average training loss = inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 2, Update 1] Average local model validation loss = 0.5187301933765411\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 3, Update 0] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + 
"\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.5035024285316467\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.049000
40.054900
60.076800
80.087800
100.106500
120.091100
140.111600
160.129900
180.105400
200.090700
220.099800
240.083200

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.5614020824432373\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40187.96 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60596.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38822.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40187.96 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60596.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39911.37 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42870.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5614\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 3, Update 0] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.23it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.5139071941375732\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:30, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.055200
40.049700
60.085100
80.116400
100.106000
120.111700
140.102500
160.114700
180.106000
200.107500
220.085900
240.080100

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.5329421162605286\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40194.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61068.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38914.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40194.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61068.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39942.97 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42944.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5329\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", 
+ "Calling join\n", + "\u001b[94m[Round 3, Update 0] Average aggregated model validation loss = 0.50870481133461\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 3, Update 0] Average training loss = inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 3, Update 0] Average local model validation loss = 0.5471720993518829\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 3, Update 1] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + 
"\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.5240783095359802\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.041300
40.044000
60.060200
80.074900
100.080700
120.078300
140.096000
160.111400
180.093000
200.079100
220.084700
240.066400

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.5784252882003784\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40185.57 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60476.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38820.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40185.57 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60476.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39910.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42950.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5784\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 3, Update 1] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.5348654389381409\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.047700
40.042700
60.064300
80.092000
100.089600
120.105100
140.101900
160.093900
180.090200
200.093600
220.081300
240.066200

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.5685837864875793\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40193.50 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60808.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38936.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40193.50 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60808.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39946.86 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42922.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5686\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", 
+ "Calling join\n", + "\u001b[94m[Round 3, Update 1] Average aggregated model validation loss = 0.5294718742370605\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 3, Update 1] Average training loss = inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 3, Update 1] Average local model validation loss = 0.5735045373439789\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 4, Update 0] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.23it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + 
"\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.5471141934394836\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.033400
40.040500
60.064200
80.065800
100.077300
120.071600
140.080400
160.088800
180.082500
200.062700
220.066100
240.056600

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.5927156209945679\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40181.67 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60874.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38838.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40181.67 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60874.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39912.48 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42850.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5927\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 4, Update 0] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.5571405291557312\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.035800
40.037800
60.059100
80.089500
100.071900
120.084300
140.088000
160.095700
180.074300
200.082400
220.071100
240.058900

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.5642604827880859\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40189.73 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61506.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38928.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40189.73 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61506.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39944.91 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 43022.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5643\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", 
+ "Calling join\n", + "\u001b[94m[Round 4, Update 0] Average aggregated model validation loss = 0.5521273612976074\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 4, Update 0] Average training loss = inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 4, Update 0] Average local model validation loss = 0.5784880518913269\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 4, Update 1] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + 
"\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.5503019094467163\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.033700
40.033800
60.052200
80.071900
100.063400
120.057900
140.065800
160.068000
180.085900
200.057100
220.056000
240.051500

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.5799501538276672\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40182.25 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60476.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38820.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40182.25 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60476.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39911.18 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42930.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5800\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 4, Update 1] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.23it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.5621325373649597\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.038600
40.032600
60.056700
80.066900
100.062500
120.060700
140.078300
160.075200
180.063500
200.071600
220.064000
240.045200

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.5817354321479797\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40191.22 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60688.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38938.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40191.22 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60688.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39944.87 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 43002.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5817\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", 
+ "Calling join\n", + "\u001b[94m[Round 4, Update 1] Average aggregated model validation loss = 0.556217223405838\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 4, Update 1] Average training loss = inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 4, Update 1] Average local model validation loss = 0.5808427929878235\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling end\n", + "\u001b[94mThis is the end of the flow for 8bit quantization\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "===== Final Metrics =====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mAverage Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mFinal Aggregated Model Loss: 0.5562\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mFinal Local Model Loss: 0.5808\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "===== Metric History =====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mTraining Loss History:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 0: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 1: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 2: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 3: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 4: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 5: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 6: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 7: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 8: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 9: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Aggregated Model Loss History:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 0: 0.5627\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 1: 0.3971\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 2: 0.3916\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 3: 0.3973\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 4: 0.4091\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 5: 
0.4787\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 6: 0.5087\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 7: 0.5295\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 8: 0.5521\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 9: 0.5562\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Local Model Loss History:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 0: 0.3994\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 1: 0.3946\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 2: 0.4055\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 3: 0.4214\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 4: 0.4889\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 5: 0.5187\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 6: 0.5472\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 7: 0.5735\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 8: 0.5785\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 9: 0.5808\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "===== Memory Usage Summary Across All Rounds =====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Portland Memory Usage Across Rounds/Updates (8bit) ====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_0_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 39210.98 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 59636.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 56253.51 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.3953\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 36814.91 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38196.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 39210.98 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m 
before_training_reserved: 59636.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 56253.51 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39008.34 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42136.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 56253.51 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_0_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40100.06 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 60872.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57164.02 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.3961\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37719.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38654.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 56518.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40100.06 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 60872.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57164.02 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39808.58 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42766.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57164.02 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_1_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40210.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 61118.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57272.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4059\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m model_load_allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38822.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57214.60 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40210.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 61118.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57272.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39916.32 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42894.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57272.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_1_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40210.16 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 60316.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4210\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38794.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40210.16 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 60316.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39911.82 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42890.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_2_update_0:\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m peak_allocated: 40203.53 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 61136.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5077\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38848.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40203.53 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 61136.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39909.20 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42830.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_2_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40192.70 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 61154.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5100\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38856.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40192.70 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 61154.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 
57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39908.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42890.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_3_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40187.96 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 60596.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5614\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38822.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40187.96 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 60596.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39911.37 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42870.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_3_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40185.57 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 60476.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5784\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m 
model_load_reserved: 38820.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40185.57 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 60476.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39910.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42950.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_4_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40181.67 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 60874.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5927\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38838.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40181.67 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 60874.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39912.48 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42850.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_4_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40182.25 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m 
peak_reserved: 60476.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5800\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38820.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40182.25 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 60476.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39911.18 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42930.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Seattle Memory Usage Across Rounds/Updates (8bit) ====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_0_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 39317.46 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 59596.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 56364.86 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4035\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 36974.44 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38276.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 56364.86 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 39317.46 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m 
before_training_reserved: 59596.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 56364.86 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39149.93 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42176.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 56364.86 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_0_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40105.33 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 60902.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57210.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.3932\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37772.41 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38798.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57164.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40105.33 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 60902.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57210.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39841.12 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42898.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57210.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_1_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40215.71 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 60930.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57319.97 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4052\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m model_load_allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38916.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57272.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40215.71 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 60930.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57319.97 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39949.17 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42986.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57319.97 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_1_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40218.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 61308.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4217\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38902.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40218.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 61308.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39946.36 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42924.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_2_update_0:\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m peak_allocated: 40210.40 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 61008.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4701\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38938.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40210.40 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 61008.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39941.88 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 43064.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_2_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40199.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 60928.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5275\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38960.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40199.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 60928.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 
57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39946.38 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42922.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_3_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40194.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 61068.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5329\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38914.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40194.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 61068.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39942.97 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42944.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_3_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40193.50 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 60808.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5686\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m 
model_load_reserved: 38936.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40193.50 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 60808.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39946.86 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42922.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_4_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40189.73 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 61506.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5643\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38928.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40189.73 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 61506.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39944.91 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 43022.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_4_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40191.22 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m 
import gc
import time

# Free memory left over from the earlier run before loading another copy of the
# model. Garbage is collected first: empty_cache() can only hand cached blocks
# back to the driver once no Python object references them any more.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("Cleared CUDA cache between runs")
time.sleep(5)  # Give system time to free memory

# Run with 8-bit quantization
print("\n=============== Running with 8-bit Quantization ===============\n")
bnb_config = bnb_config_8bit  # Set active config to 8-bit

# Force model to be loaded on CPU first for 8-bit quantization
print("Loading model with 8-bit quantization on CPU first...")
model_8bit = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cpu",  # Start on CPU to avoid OOM
    quantization_config=bnb_config_8bit,
    trust_remote_code=True,
    torch_dtype=torch.float32,  # Use float32 for CPU
    low_cpu_mem_usage=True,
)
model_8bit = prepare_model_for_kbit_training(model_8bit)
model_8bit = get_peft_model(model_8bit, peft_config)

# Run the 8-bit flow for 5 rounds with the `aggregator` / `collaborators`
# objects that are already defined in the notebook, so the run is set up the
# same way as the other flow it is compared against in the cells below.
# NOTE(review): the captured output of this cell reports `training_loss: inf`
# for the rounds shown, while eval_loss stays finite. The cause is not visible
# from this cell - investigate before relying on the 8-bit training-loss numbers.
runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators)
flflow_8bit = FederatedFlow(model_8bit, rounds=5, quant_type="8bit")
flflow_8bit.runtime = runtime
flflow_8bit.run()
def tensor_to_float(val):
    """Convert a single-element tensor to a plain Python float.

    Parameters:
        val: A ``torch.Tensor`` holding exactly one element (on any device),
            ``None``, or any other value.

    Returns:
        A Python ``float`` when ``val`` is a tensor; otherwise ``val`` itself,
        unchanged (so ``None`` stays ``None`` and numbers pass through).

    Raises:
        RuntimeError: If ``val`` is a tensor with more than one element.
    """
    if isinstance(val, torch.Tensor):
        # .item() copies the scalar to host memory itself, so no explicit
        # .cpu()/.numpy() round trip is needed.
        return float(val.detach().item())
    return val


# Flow attributes that hold one metric value per round.
_HISTORY_ATTRS = (
    "average_loss_history",
    "agg_model_loss_history",
    "local_model_loss_history",
)
# Flow attributes that hold the most recent metric value.
_SCALAR_ATTRS = (
    "average_loss",
    "aggregated_model_accuracy",
    "local_model_accuracy",
)
# Keys inside each per-round memory-stats dict that may hold tensors.
_STAT_KEYS = ("training_loss", "eval_loss")


def _convert_flow_metrics(flow):
    """Replace every tensor-valued metric on ``flow`` with a Python float, in place."""
    for attr in _HISTORY_ATTRS:
        setattr(flow, attr, [tensor_to_float(x) for x in getattr(flow, attr)])

    for attr in _SCALAR_ATTRS:
        setattr(flow, attr, tensor_to_float(getattr(flow, attr)))

    for rounds_data in flow.all_memory_stats.values():
        for stats in rounds_data.values():
            for key in _STAT_KEYS:
                if key in stats:
                    # Non-tensor values pass through tensor_to_float untouched.
                    stats[key] = tensor_to_float(stats[key])


# Convert all tensors in both flow objects. The conversion is idempotent, so
# re-running this cell is safe.
print("Converting any CUDA tensors to CPU for visualization...")

for flow in (flflow_4bit, flflow_8bit):
    _convert_flow_metrics(flow)

print("Conversion complete. Ready for visualization.")
# Visualize memory usage across quantization methods
# `plot_memory_metrics` is defined outside this cell. In the captured output it
# renders a figure and prints a "Performance Summary" with the average memory
# use and average evaluation loss of the 4-bit and 8-bit runs.
plot_memory_metrics(flflow_4bit, flflow_8bit)

# Visualize training and validation loss across quantization methods
# `plot_loss_metrics` is defined outside this cell. In the captured output it
# renders a figure and prints mean ± std of the training and eval losses per
# run; the 8-bit training loss is printed as `inf ± nan`.
plot_loss_metrics(flflow_4bit, flflow_8bit)
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "==== Percentage Difference (8-bit vs 4-bit) ====\n", + "\n", + "Avg Training Loss: 8-bit is inf% higher than 4-bit\n", + "Agg Model Loss: 8-bit is 0.52% higher than 4-bit\n", + "Local Model Loss: 8-bit is 1.13% lower than 4-bit\n" + ] + } + ], + "source": [ + "# Visualize aggregated metrics (memory vs. performance tradeoff)\n", + "plot_aggregated_metrics(flflow_4bit, flflow_8bit)" + ] + }, + { + "cell_type": "markdown", + "id": "9a4de940", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "This notebook has demonstrated how to implement federated fine-tuning of Microsoft's Phi-4 model using OpenFL with both 4-bit and 8-bit quantization approaches. The visualization and analysis above help us understand the tradeoffs between these quantization methods:\n", + "\n", + "### Memory Usage and Performance Comparison\n", + "\n", + "- **Memory Footprint**: 4-bit quantization used approximately 2.6% less memory (55,770 MB vs 57,204 MB) compared to 8-bit quantization.\n", + "\n", + "- **Model Quality**: 4-bit quantization achieved better loss metrics overall:\n", + " - Training Loss: 0.1754 ± 0.0877 for 4-bit vs. `inf` for 8-bit (the 8-bit training loss was reported as non-finite, so it is not a meaningful comparison point)\n", + " - Evaluation Loss: 0.4618 ± 0.0725 for 4-bit vs. 0.4909 ± 0.0776 for 8-bit (6.3% higher)\n", + "\n", + "- **Performance Difference by Metric**:\n", + " - Average Training Loss: not comparable, because the 8-bit run reported `inf`; this points to a numerical problem to investigate rather than a measured quality gap\n", + " - Aggregated Model Loss: 8-bit was 0.52% higher (worse) than 4-bit\n", + " - Local Model Loss: 8-bit was 1.13% lower (better) than 4-bit\n", + "\n", + "### Key Insights\n", + "\n", + "1. 
**Memory-Performance Tradeoff**: While 8-bit quantization required slightly more memory, the relative performance differences in evaluation metrics were more significant, suggesting 4-bit quantization offers a better memory-performance balance for this model and task.\n", + "\n", + "2. **Training Stability**: The 4-bit run produced finite training losses (0.1754 ± 0.0877), whereas the 8-bit run reported a non-finite (`inf`) training loss even though its evaluation loss stayed finite. A non-finite loss signals a numerical problem in the 8-bit training path, so it should be investigated before drawing conclusions about training quality from this comparison.\n", + "\n", + "3. **Efficiency Considerations**: Despite the memory analysis suggesting 8-bit provides more efficient memory usage relative to loss in some metrics, the overall performance profile favors 4-bit quantization for practical federated learning deployments.\n", + "\n", + "By combining federated learning with appropriate quantization techniques, we can successfully fine-tune large language models while balancing computational resource constraints across federated devices. For this Phi-4 model, the 4-bit quantization approach appears to offer the better balance of memory efficiency and model performance." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (myenv)", + "language": "python", + "name": "myenv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4.ipynb deleted file mode 100644 index 0c6884f384..0000000000 --- a/openfl-tutorials/experimental/workflow/LLM/phi-4.ipynb +++ /dev/null @@ -1,705 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "a59f475d-d843-46bc-b75e-10984b687ed3", - "metadata": {}, - "source": [ - "# Federated Fine-Tuning of Phi-4 Using OpenFL" - ] - }, - { - "cell_type": "markdown", - "id": "20c74cb9-51a2-42e2-893f-d280e227e8bf", - "metadata": {}, - "source": [ - "\n", - "In this tutorial, we demonstrate how to fine-tune Microsoft's Phi-4 model in a federated learning workflow.\n", - "\n", - "We will fine-tune **Microsoft's [Phi4](https://huggingface.co/microsoft/phi-4)** model using a diverse dataset such as [Math_10k](https://github.com/AGI-Edgerunners/LLM-Adapters/tree/main), an open-source dataset containing mathematical question-answer pairs collected from various smaller math datasets." - ] - }, - { - "cell_type": "markdown", - "id": "d07c32d3-1a8d-4162-af45-bc3a10e0ae3f", - "metadata": {}, - "source": [ - "## The Workflow Interface" - ] - }, - { - "cell_type": "markdown", - "id": "e3d74610-e48d-4dd4-b622-eb910fbe91aa", - "metadata": {}, - "source": [ - "The workflow interface is an innovative approach to designing federated learning experiments with OpenFL. 
import hashlib
import os

import numpy as np
import requests
import torch
import transformers
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from transformers.trainer_callback import PrinterCallback
from trl import SFTTrainer

from openfl.experimental.workflow.interface import Aggregator, Collaborator, FLSpec
from openfl.experimental.workflow.placement import aggregator, collaborator
from openfl.experimental.workflow.runtime import LocalRuntime


# --- Acquire the dataset -----------------------------------------------------

MATH_10K_URL = "https://raw.githubusercontent.com/AGI-Edgerunners/LLM-Adapters/main/ft-training_set/math_10k.json"
MATH_10K_PATH = "math_10k.json"
MATH_10K_SHA256 = "0342d0d860ad8592b579329337c90e42eefd3d9f2898043140cbd120630418b8"
DOWNLOAD_TIMEOUT_SECONDS = 60


def file_checksum(file_path, algorithm="sha256"):
    """
    Calculate the checksum of a file using the specified hashing algorithm.

    Parameters:
        file_path (str): The path to the file for which the checksum is to be calculated.
        algorithm (str): The hashing algorithm to use (default is 'sha256').

    Returns:
        str: The calculated checksum of the file as a hexadecimal string.
    """
    hash_func = hashlib.new(algorithm)
    with open(file_path, "rb") as f:
        # Read in fixed-size chunks so the whole file never sits in memory.
        for chunk in iter(lambda: f.read(4096), b""):
            hash_func.update(chunk)
    return hash_func.hexdigest()


freshly_downloaded = not os.path.exists(MATH_10K_PATH)
if freshly_downloaded:
    response = requests.get(MATH_10K_URL, timeout=DOWNLOAD_TIMEOUT_SECONDS)
    # Fail loudly on HTTP errors instead of saving an error page as the dataset.
    response.raise_for_status()
    with open(MATH_10K_PATH, "wb") as f:
        f.write(response.content)

# Verify on every run, not only right after a download: otherwise a file that
# failed verification once would be picked up silently on the next run.
if file_checksum(MATH_10K_PATH) != MATH_10K_SHA256:
    if freshly_downloaded:
        # Do not leave a bad download behind; a pre-existing file is left alone.
        os.remove(MATH_10K_PATH)
    raise ValueError(
        "Checksum verification failed. The file may have been altered."
    )

raw_dataset = load_dataset("json", data_files=MATH_10K_PATH)


# --- Arguments and configurations --------------------------------------------

training_config = {
    "bf16": True,
    "use_cpu": True,
    "do_eval": False,
    "learning_rate": 5.0e-06,
    "log_level": "info",
    "logging_steps": 20,
    "lr_scheduler_type": "cosine",
    "num_train_epochs": 1,
    "output_dir": "./checkpoint_dir",
    "overwrite_output_dir": True,
    "per_device_eval_batch_size": 1,
    "per_device_train_batch_size": 1,
    "save_steps": 100,
    "save_total_limit": 1,
    "seed": 0,
    "gradient_checkpointing": True,
    "gradient_checkpointing_kwargs": {"use_reentrant": False},
    "warmup_ratio": 0.2,
}

peft_config = {
    "r": 1,
    "lora_alpha": 2,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
    "target_modules": "all-linear",
    "modules_to_save": None,
}
# NOTE(review): trust_remote_code=True runs Python code shipped with the
# checkpoint repository, and `checkpoint_path` below is not the official
# microsoft/phi-4 repository - confirm the source is trusted or pin a revision.
model_kwargs = dict(
    use_cache=False,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map=None,
)
train_conf = TrainingArguments(**training_config)
peft_conf = LoraConfig(**peft_config)


# --- Load and initialize model -----------------------------------------------

checkpoint_path = "NyxKrage/Microsoft_Phi-4"
model = AutoModelForCausalLM.from_pretrained(
    checkpoint_path, return_dict=True, **model_kwargs
)
model = get_peft_model(model, peft_conf)

tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
sequence_max_length = 512
val_set_size = 2000
tokenizer.pad_token_id = 0  # we want this to be different from the eos token
tokenizer.padding_side = "left"  # Allow batched inference


# --- Preprocess dataset ------------------------------------------------------

def generate_prompt(data_point):
    """
    Generate a prompt based on the given data point.

    The template is assembled from explicit newline-joined pieces so that the
    indentation of this source file never ends up inside the prompt text.

    Parameters:
        data_point (dict): A dictionary containing the instruction, input, and output.

    Returns:
        str: The generated prompt as a string.
    """
    instruction = data_point["instruction"]
    response = data_point["output"]
    if data_point["input"]:
        return (
            "Below is an instruction that describes a task, paired with an input "
            "that provides further context. Write a response that appropriately "
            "completes the request.\n\n"
            f"### Instruction:\n{instruction}\n\n"
            f"### Input:\n{data_point['input']}\n\n"
            f"### Response:\n{response}"
        )
    return (
        "Below is an instruction that describes a task. Write a response that "
        "appropriately completes the request.\n\n"
        f"### Instruction:\n{instruction}\n\n"
        f"### Response:\n{response}"
    )


def tokenize(prompt, add_eos_token=True):
    """
    Tokenize the given prompt.

    Parameters:
        prompt (str): The prompt to be tokenized.
        add_eos_token (bool): Whether to add an end-of-sequence token (default is True).

    Returns:
        dict: A dictionary containing the tokenized input IDs, attention mask,
            and a `labels` copy of the input IDs.
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=sequence_max_length,
        padding=False,
        return_tensors=None,
    )
    input_ids = result["input_ids"]
    # Append EOS only when requested, when there is room left, and when the
    # sequence does not already end with it. The emptiness check keeps an empty
    # token list from raising IndexError on input_ids[-1].
    if (
        add_eos_token
        and len(input_ids) < sequence_max_length
        and (not input_ids or input_ids[-1] != tokenizer.eos_token_id)
    ):
        input_ids.append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = input_ids.copy()

    return result


def generate_and_tokenize_prompt(data_point):
    """
    Generate and tokenize a prompt based on the given data point.

    Parameters:
        data_point (dict): A dictionary containing the instruction, input, and output.

    Returns:
        dict: A dictionary containing the tokenized input IDs, attention mask, and labels.
    """
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenize(full_prompt)
    # Tokenize the prompt again with an empty output to count how many leading
    # tokens belong to the instruction part.
    user_prompt = generate_prompt({**data_point, "output": ""})
    tokenized_user_prompt = tokenize(user_prompt, add_eos_token=False)
    user_prompt_len = len(tokenized_user_prompt["input_ids"])

    # Overwrite the instruction part of the labels with -100.
    # presumably -100 is the label value the loss ignores - TODO confirm.
    tokenized_full_prompt["labels"] = [-100] * user_prompt_len + tokenized_full_prompt[
        "labels"
    ][user_prompt_len:]
    return tokenized_full_prompt


SAMPLES_PER_SPLIT = 3  # tiny subset so the tutorial runs quickly
SHUFFLE_SEED = 42

train_val = raw_dataset["train"].train_test_split(
    test_size=val_set_size, shuffle=True, seed=SHUFFLE_SEED
)

# Select the subset before mapping so only the rows that are kept get tokenized,
# and seed the shuffle so the same rows are chosen on every run.
processed_train_dataset = (
    train_val["train"]
    .shuffle(seed=SHUFFLE_SEED)
    .select(range(SAMPLES_PER_SPLIT))
    .map(generate_and_tokenize_prompt)
)
processed_test_dataset = (
    train_val["test"]
    .shuffle(seed=SHUFFLE_SEED)
    .select(range(SAMPLES_PER_SPLIT))
    .map(generate_and_tokenize_prompt)
)


# --- Federated averaging -----------------------------------------------------

def FedAvg(peft_params, model, weights=None):
    """
    Perform Federated Averaging (FedAvg) on the model parameters.

    Parameters:
        peft_params (list): A list of state dictionaries containing the model parameters from different clients.
        model (torch.nn.Module): The model to which the averaged parameters will be applied.
        weights (list, optional): A list of weights for averaging the parameters. If None, equal weights are used.

    Returns:
        torch.nn.Module: The model with the averaged parameters applied.

    Raises:
        ValueError: If `peft_params` is empty.
    """
    if not peft_params:
        raise ValueError("FedAvg requires at least one set of PEFT parameters")

    state_dict = get_peft_model_state_dict(model)
    reference = peft_params[0]
    for key in reference:
        dtype = reference[key].dtype
        # Average in float32 on the host, then cast back to the original dtype.
        # detach()/cpu() make the NumPy conversion work for tensors that track
        # gradients or live on an accelerator.
        client_values = [
            state[key].detach().cpu().to(torch.float).numpy() for state in peft_params
        ]
        averaged = np.asarray(np.average(client_values, axis=0, weights=weights))
        state_dict[key] = torch.from_numpy(averaged).to(dtype)
    set_peft_model_state_dict(model, state_dict)
    return model
def _build_trainer(model, train_dataset, eval_dataset):
    """Create the SFTTrainer configuration shared by every collaborator step."""
    return SFTTrainer(
        model=model,
        args=train_conf,
        peft_config=peft_conf,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        max_seq_length=sequence_max_length,
        dataset_text_field="text",
        tokenizer=tokenizer,
        packing=True,
        data_collator=transformers.DataCollatorForSeq2Seq(
            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
        ),
    )


class FederatedFlow(FLSpec):
    """Federated fine-tuning flow.

    Each round runs start -> aggregated_model_validation -> train ->
    local_model_validation on every collaborator, then join on the aggregator,
    which averages the PEFT parameters and either starts the next round or ends.
    """

    def __init__(self, model=None, optimizer=None, rounds=3, **kwargs):
        """
        Initialize the class with the given model, optimizer, and number of rounds.

        Parameters:
            model (torch.nn.Module): The PEFT model to be used. Required.
            optimizer (torch.optim.Optimizer, optional): The optimizer to be used.
            rounds (int, optional): The number of rounds for training or processing (default is 3).
            **kwargs: Additional keyword arguments to be passed to the superclass initializer.

        Raises:
            ValueError: If no model is provided.
        """
        super().__init__(**kwargs)
        if model is None:
            raise ValueError("No model inputted")

        self.model = model
        self.peft_params = get_peft_model_state_dict(self.model)
        self.optimizer = optimizer
        self.rounds = rounds

    @aggregator
    def start(self):
        """
        Set up the collaborators and start the first round.

        Reads the collaborator list from the runtime, resets the round counter,
        and dispatches `aggregated_model_validation` to every collaborator.
        """
        print("Performing initialization for model")
        self.collaborators = self.runtime.collaborators
        self.current_round = 0
        self.next(
            self.aggregated_model_validation,
            foreach="collaborators",
        )

    @collaborator
    def aggregated_model_validation(self):
        """
        Evaluate the current aggregated parameters on a collaborator.

        Reloads the base model, wraps it with the PEFT configuration, loads the
        aggregated PEFT parameters into it, and stores the resulting eval loss
        in `agg_validation_score`.
        """
        print(f"Performing aggregated model validation for collaborator {self.input}")
        self.model = AutoModelForCausalLM.from_pretrained(
            checkpoint_path, return_dict=True, **model_kwargs
        )
        self.model = get_peft_model(self.model, peft_conf)
        set_peft_model_state_dict(self.model, self.peft_params)

        trainer = _build_trainer(self.model, self.train_dataset, self.eval_dataset)
        trainer.remove_callback(PrinterCallback)
        out = trainer.evaluate()
        self.agg_validation_score = out["eval_loss"]
        print(f"{self.input} value of {self.agg_validation_score}")
        self.next(self.train)

    @collaborator
    def train(self):
        """
        Train the model on a collaborator.

        Stores the training loss in `loss`, saves the model, and moves on to
        `local_model_validation`.
        """
        trainer = _build_trainer(self.model, self.train_dataset, self.eval_dataset)

        out = trainer.train()
        self.loss = out.training_loss
        trainer.save_model()
        self.training_completed = True
        self.next(self.local_model_validation)

    @collaborator
    def local_model_validation(self):
        """
        Evaluate the locally trained model on a collaborator.

        Stores the eval loss in `local_validation_score`, captures the updated
        PEFT parameters, and hands over to `join` without the full model.
        """
        # Use the collaborator's own datasets, as the two previous steps do,
        # rather than the notebook-level `processed_*` globals.
        trainer = _build_trainer(self.model, self.train_dataset, self.eval_dataset)
        out = trainer.evaluate()
        self.local_validation_score = out["eval_loss"]
        self.peft_params = get_peft_model_state_dict(self.model)
        print(f"Doing local model validation for collaborator {self.input}")
        # Only the PEFT parameters travel to the aggregator; the model is excluded.
        self.next(self.join, exclude=["training_completed", "model"])

    @aggregator
    def join(self, inputs):
        """
        Aggregate the results from all collaborators and update the model.

        Averages the training loss and both validation scores over `inputs`,
        applies Federated Averaging (FedAvg) to the PEFT parameters, saves the
        aggregated model and tokenizer, and starts the next round if any remain.
        """
        num_inputs = len(inputs)
        self.average_loss = sum(result.loss for result in inputs) / num_inputs
        # The two "*_accuracy" attributes hold averaged eval losses; the names
        # are kept because code outside this class may read them.
        self.aggregated_model_accuracy = (
            sum(result.agg_validation_score for result in inputs) / num_inputs
        )
        self.local_model_accuracy = (
            sum(result.local_validation_score for result in inputs) / num_inputs
        )
        print(
            f"Average aggregated model validation values = {self.aggregated_model_accuracy}"
        )
        print(f"Average training loss = {self.average_loss}")
        print(f"Average local model validation values = {self.local_model_accuracy}")

        self.model = FedAvg([result.peft_params for result in inputs], self.model)
        self.peft_params = get_peft_model_state_dict(self.model)

        self.model.save_pretrained("./aggregated/model")
        tokenizer.save_pretrained("./aggregated/tokenizer")
        self.current_round += 1
        if self.current_round < self.rounds:
            self.next(
                self.aggregated_model_validation,
                foreach="collaborators",
                exclude=["model"],
            )
        else:
            self.next(self.end)

    @aggregator
    def end(self):
        """
        End the federated learning process.

        Only prints a closing message.
        """
        print("This is the end of the flow")
These are **private_attributes** that are exposed only through the runtime. Each participant has its own set of private attributes: a dictionary where the key is the attribute name, and the value is the object that will be made accessible through that participant's task.\n", - "\n", - "Below, we segment shards of the Math_10k dataset for **two collaborators**: Portland and Seattle. Each has their own slice of the dataset that's accessible via the `train_dataset` or `eval_dataset` attribute. Note that the private attributes are flexible, and you can choose to pass in a completely different type of object to any of the collaborators or aggregator (with an arbitrary name). These private attributes will always be filtered out of the current state when transferring from collaborator to aggregator, or vice versa." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e5e108c6-5150-4931-9c01-6b64a913fa04", - "metadata": {}, - "outputs": [], - "source": [ - "# Setup participants\n", - "_aggregator = Aggregator()\n", - "_aggregator.private_attributes = {}\n", - "\n", - "# Setup collaborators with private attributes\n", - "collaborator_names = [\n", - " \"Portland\",\n", - " \"Seattle\",\n", - "]\n", - "_collaborators = [Collaborator(name=name) for name in collaborator_names]\n", - "\n", - "for idx, current_collaborator in enumerate(_collaborators):\n", - " # Set the private attributes of the Collaborator to include their specific training and testing data loaders\n", - " current_collaborator.private_attributes = {\n", - " \"train_dataset\": processed_train_dataset.shard(\n", - " num_shards=len(_collaborators), index=idx\n", - " ),\n", - " \"eval_dataset\": processed_test_dataset.shard(\n", - " num_shards=len(_collaborators), index=idx\n", - " ),\n", - " }\n", - "\n", - "local_runtime = LocalRuntime(\n", - " aggregator=_aggregator, collaborators=_collaborators, backend=\"single_process\"\n", - ")\n", - "print(f\"Local runtime collaborators = 
"""
Utility functions for Phi-4 model quantization and federated learning experiments.
This module contains:
- Memory tracking utilities
- Visualization functions for comparing 4-bit and 8-bit quantization
"""

# flake8: noqa: E501

import numpy as np
import pandas as pd
import torch

# The plotting stack is optional: memory tracking must stay importable on
# machines without matplotlib/seaborn. The plotting functions check for these
# names lazily and raise ImportError when they are actually needed.
try:
    import matplotlib.pyplot as plt
    from matplotlib.ticker import EngFormatter  # noqa: F401  (kept so existing imports of this name keep working)
except ImportError:
    plt = None
    EngFormatter = None

try:
    import seaborn as sns
except ImportError:
    sns = None


_BYTES_PER_MB = 1024 * 1024

# Display labels used by plot_memory_metrics, in plotting order.
_QUANT_LABELS = ("4-bit", "8-bit")

# Preferred styling for the tutorial's collaborators. Any other collaborator
# name falls back to matplotlib's colour cycle and a per-quantization marker.
_COLORS = {
    "4-bit": {"Portland": "blue", "Seattle": "green"},
    "8-bit": {"Portland": "darkblue", "Seattle": "darkgreen"},
}
_MARKERS = {
    "4-bit": {"Portland": "o", "Seattle": "s"},
    "8-bit": {"Portland": "^", "Seattle": "D"},
}
_DEFAULT_MARKER = {"4-bit": "o", "8-bit": "^"}

# (display name, flow attribute) pairs read by plot_aggregated_metrics.
_HISTORY_ATTRS = (
    ("Avg Training Loss", "average_loss_history"),
    ("Agg Model Loss", "agg_model_loss_history"),
    ("Local Model Loss", "local_model_loss_history"),
)


def _empty_memory_info():
    """Return a fresh all-zero memory snapshot."""
    return {"allocated": 0, "reserved": 0, "max_allocated": 0}


def _require_plotting(seaborn=False):
    """Raise ImportError when the optional plotting dependencies are missing.

    Args:
        seaborn: Also require seaborn (only some plots use it).
    """
    if plt is None:
        raise ImportError("matplotlib is required for plotting")
    if seaborn and sns is None:
        raise ImportError("seaborn is required for this plot")


def _tensor_to_float(val):
    """Convert a single-element tensor to a Python float; pass other values through.

    ``None`` is returned unchanged so callers can tell "no value recorded".
    """
    if val is None:
        return None
    if isinstance(val, torch.Tensor):
        return val.detach().cpu().float().item()
    return val


def _loss_or_nan(val):
    """Like ``_tensor_to_float`` but maps a missing loss to NaN.

    NaN is skipped by matplotlib and by pandas means, whereas a placeholder
    of 0 would be drawn as a perfect loss and drag averages down.
    """
    converted = _tensor_to_float(val)
    return float("nan") if converted is None else converted


def _round_number(round_name):
    """Extract the integer round from a ``"<prefix>_<n>"`` key; 0 if there is no underscore."""
    return int(round_name.split("_")[1]) if "_" in round_name else 0


def _pct_diff(new, base):
    """Percentage change of ``new`` relative to ``base``; NaN when ``base`` is zero."""
    if base == 0:
        return float("nan")
    return (new - base) / base * 100


def get_gpu_memory_info():
    """Get GPU memory usage information in MB.

    Returns:
        dict: ``allocated``, ``reserved`` and ``max_allocated`` as reported by
        ``torch.cuda``, divided by 1024 * 1024. All three are 0 when CUDA is
        unavailable or the query fails.
    """
    try:
        if not torch.cuda.is_available():
            return _empty_memory_info()
        return {
            "allocated": torch.cuda.memory_allocated() / _BYTES_PER_MB,
            "reserved": torch.cuda.memory_reserved() / _BYTES_PER_MB,
            "max_allocated": torch.cuda.max_memory_allocated() / _BYTES_PER_MB,
        }
    except Exception:
        # Best-effort probe: a failing CUDA query must not abort training.
        # Only Exception is caught so Ctrl-C / SystemExit still propagate.
        return _empty_memory_info()


class MemoryTracker:
    """Track GPU memory usage during training"""

    def __init__(self, collaborator_name, quant_type):
        """
        Args:
            collaborator_name: Name printed in the report header.
            quant_type: Quantization label stored with the statistics.
        """
        self.collaborator_name = collaborator_name
        self.quant_type = quant_type
        # Maps a caller-chosen stage label to the snapshot taken at that stage.
        self.timestamps = {}
        self.peak = _empty_memory_info()
        self.training_loss = None
        self.eval_loss = None

    def log(self, timestamp):
        """Log current memory usage at a specific timestamp"""
        self.timestamps[timestamp] = get_gpu_memory_info()

    def log_loss(self, training_loss=None, eval_loss=None):
        """Log training or evaluation loss; arguments left as None keep the stored value."""
        if training_loss is not None:
            self.training_loss = training_loss
        if eval_loss is not None:
            self.eval_loss = eval_loss

    def update_peak(self):
        """Raise each stored peak value to the current reading if that is higher."""
        current = get_gpu_memory_info()
        for key in ("allocated", "reserved", "max_allocated"):
            self.peak[key] = max(self.peak[key], current[key])

    def reset_peak(self):
        """Reset peak memory usage values"""
        self.peak = _empty_memory_info()

    def report(self):
        """Print memory usage report"""
        print(f"\n==== Memory Usage Report for {self.collaborator_name} ({self.quant_type}) ====")
        print("Peak Memory Usage:")
        print(f" Allocated: {self.peak['allocated']:.2f} MB")
        print(f" Reserved: {self.peak['reserved']:.2f} MB")
        print(f" Max Allocated: {self.peak['max_allocated']:.2f} MB")

        print("\nMemory Usage by Stage:")
        for timestamp, mem in self.timestamps.items():
            print(f" {timestamp}:")
            print(f" Allocated: {mem['allocated']:.2f} MB")
            print(f" Reserved: {mem['reserved']:.2f} MB")
            print(f" Max Allocated: {mem['max_allocated']:.2f} MB")

        print("\nPerformance Metrics:")
        if self.training_loss is not None:
            print(f" Training Loss: {self.training_loss:.4f}")
        if self.eval_loss is not None:
            print(f" Evaluation Loss: {self.eval_loss:.4f}")
        print("-" * 50)

    def get_stats(self):
        """Get all statistics as a dictionary

        Returns:
            dict: peak values under ``peak_*``, ``quant_type``, both losses
            (``None`` when never logged) and, for every logged stage,
            ``<stage>_allocated`` / ``<stage>_reserved`` / ``<stage>_max_allocated``.
        """
        stats = {
            "peak_allocated": self.peak["allocated"],
            "peak_reserved": self.peak["reserved"],
            "peak_max_allocated": self.peak["max_allocated"],
            "quant_type": self.quant_type,
            "training_loss": self.training_loss,
            "eval_loss": self.eval_loss,
        }
        for timestamp, mem in self.timestamps.items():
            stats[f"{timestamp}_allocated"] = mem["allocated"]
            stats[f"{timestamp}_reserved"] = mem["reserved"]
            stats[f"{timestamp}_max_allocated"] = mem["max_allocated"]
        return stats


def _memory_rows(flow, quant_label):
    """Flatten ``flow.all_memory_stats`` into one row per (collaborator, round)."""
    rows = []
    for collab, rounds_data in flow.all_memory_stats.items():
        for round_name, metrics in rounds_data.items():
            rows.append(
                {
                    "Collaborator": collab,
                    "Round": _round_number(round_name),
                    "Quantization": quant_label,
                    "Peak Memory (MB)": metrics.get("peak_max_allocated", 0),
                    "Training Loss": _loss_or_nan(metrics.get("training_loss")),
                    "Eval Loss": _loss_or_nan(metrics.get("eval_loss")),
                }
            )
    return rows


def _iter_series(df):
    """Yield ``(quant_type, collaborator, rows sorted by round)`` for every plotted series."""
    for quant_type in _QUANT_LABELS:
        for collab in df["Collaborator"].unique():
            mask = (df["Quantization"] == quant_type) & (df["Collaborator"] == collab)
            yield quant_type, collab, df[mask].sort_values("Round")


def _series_style(collab, quant_type):
    """Return ``(color, marker)``; color is None (matplotlib default) for unknown collaborators."""
    color = _COLORS[quant_type].get(collab)
    marker = _MARKERS[quant_type].get(collab, _DEFAULT_MARKER[quant_type])
    return color, marker


def _plot_metric_by_round(ax, df, column, title, ylabel):
    """Draw one line per collaborator/quantization pair for ``column`` against the round."""
    ax.set_title(title)
    for quant_type, collab, subset in _iter_series(df):
        color, marker = _series_style(collab, quant_type)
        ax.plot(
            subset["Round"],
            subset[column],
            marker=marker,
            linestyle="-",
            label=f"{collab} ({quant_type})",
            color=color,
        )
    ax.set_xlabel("Round")
    ax.set_ylabel(ylabel)
    ax.legend()
    ax.grid(True, alpha=0.3)


def _plot_memory_vs_loss(ax, df):
    """Bubble chart of peak memory against evaluation loss, annotated with round numbers."""
    ax.set_title("Memory Usage vs. Evaluation Loss")
    for quant_type, collab, subset in _iter_series(df):
        # Points without both coordinates cannot be placed or annotated.
        subset = subset.dropna(subset=["Peak Memory (MB)", "Eval Loss"])
        color, marker = _series_style(collab, quant_type)

        # Size proportional to round number for visual differentiation
        sizes = [100 * (r + 1) for r in subset["Round"]]

        ax.scatter(
            subset["Peak Memory (MB)"],
            subset["Eval Loss"],
            s=sizes,
            alpha=0.7,
            label=f"{collab} ({quant_type})",
            color=color,
            marker=marker,
        )

        # Add round number annotations
        for _, row in subset.iterrows():
            ax.annotate(
                f"R{int(row['Round'])}",
                (row["Peak Memory (MB)"], row["Eval Loss"]),
                xytext=(5, 5),
                textcoords="offset points",
            )

    ax.set_xlabel("Peak Memory (MB)")
    ax.set_ylabel("Evaluation Loss")
    ax.legend()
    ax.grid(True, alpha=0.3)


def _print_memory_summary(df):
    """Print mean memory/loss per quantization and the relative differences."""
    print("\n==== Performance Summary ====\n")
    # Index by label instead of by position so a flow without data yields NaN
    # rows rather than a KeyError.
    summary = (
        df.groupby("Quantization")[["Peak Memory (MB)", "Training Loss", "Eval Loss"]]
        .mean()
        .reindex(list(_QUANT_LABELS))
    )
    mem_4bit = summary.loc["4-bit", "Peak Memory (MB)"]
    mem_8bit = summary.loc["8-bit", "Peak Memory (MB)"]
    eval_4bit = summary.loc["4-bit", "Eval Loss"]
    eval_8bit = summary.loc["8-bit", "Eval Loss"]

    mem_diff_pct = _pct_diff(mem_8bit, mem_4bit)
    eval_diff_pct = _pct_diff(eval_8bit, eval_4bit)

    print("Memory Usage Comparison:")
    print(f" 4-bit Avg: {mem_4bit:.2f} MB")
    print(f" 8-bit Avg: {mem_8bit:.2f} MB")
    print(
        f" Difference: {abs(mem_diff_pct):.1f}% {'more' if mem_diff_pct > 0 else 'less'} memory with 8-bit"
    )

    print("\nEvaluation Loss Comparison:")
    print(f" 4-bit Avg: {eval_4bit:.4f}")
    print(f" 8-bit Avg: {eval_8bit:.4f}")
    print(
        f" Difference: {abs(eval_diff_pct):.1f}% {'higher' if eval_diff_pct > 0 else 'lower'} loss with 8-bit"
    )

    # Loss change per MB of memory change; undefined when memory use is equal.
    mem_gap = mem_4bit - mem_8bit
    loss_efficiency = (eval_4bit - eval_8bit) / mem_gap if mem_gap != 0 else float("nan")

    if loss_efficiency > 0:
        efficiency_msg = "8-bit provides more efficiency memory usage relative to loss"
    else:
        efficiency_msg = "4-bit provides more efficiency memory usage relative to loss"

    print(f"\nEfficiency Analysis: {efficiency_msg}")


def plot_memory_metrics(flow_4bit, flow_8bit):
    """Plot and compare memory metrics between 4-bit and 8-bit quantization.

    Reads ``all_memory_stats`` from both flows (collaborator -> round key ->
    stats dict) and draws four panels: peak memory, training loss and
    evaluation loss by round, plus memory against evaluation loss. Memory
    axes show the MB values directly.

    This function is best-effort: any failure is printed instead of raised.

    Args:
        flow_4bit: Flow run with 4-bit quantization.
        flow_8bit: Flow run with 8-bit quantization.
    """
    try:
        rows = _memory_rows(flow_4bit, "4-bit") + _memory_rows(flow_8bit, "8-bit")
        if not rows:
            print("No memory statistics available to plot.")
            return

        _require_plotting()
        df = pd.DataFrame(rows)

        # Create figure with multiple subplots
        fig, axs = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle("4-bit vs 8-bit Quantization Comparison", fontsize=16)

        _plot_metric_by_round(
            axs[0, 0], df, "Peak Memory (MB)", "Peak Memory Usage by Round", "Memory (MB)"
        )
        _plot_metric_by_round(axs[0, 1], df, "Training Loss", "Training Loss by Round", "Loss")
        _plot_metric_by_round(axs[1, 0], df, "Eval Loss", "Evaluation Loss by Round", "Loss")
        _plot_memory_vs_loss(axs[1, 1], df)

        plt.tight_layout()
        plt.subplots_adjust(top=0.92)
        plt.show()

        _print_memory_summary(df)
    except ImportError:
        print(
            "Plotting requires matplotlib and pandas. Install with: pip install matplotlib pandas"
        )
    except Exception as e:
        print(f"Error plotting metrics: {str(e)}")


def _loss_rows(flow, default_quant):
    """Collect one row per (collaborator, round) that recorded at least one loss."""
    rows = []
    for collab, rounds_data in flow.all_memory_stats.items():
        for round_name, stats in rounds_data.items():
            round_num = _round_number(round_name)
            training_loss = _tensor_to_float(stats.get("training_loss"))
            eval_loss = _tensor_to_float(stats.get("eval_loss"))
            if training_loss is None and eval_loss is None:
                continue
            rows.append(
                {
                    "Collaborator": collab,
                    "Round": round_name,
                    "Round Number": round_num,
                    "Training Loss": training_loss,
                    "Eval Loss": eval_loss,
                    "Quantization": stats.get("quant_type", default_quant),
                }
            )
    return rows


def plot_loss_metrics(flow_4bit, flow_8bit):
    """Plot training and evaluation loss metrics comparing 4-bit and 8-bit quantization

    Args:
        flow_4bit: Flow whose ``all_memory_stats`` hold the 4-bit results.
        flow_8bit: Flow with the 8-bit results, or ``None`` to plot the first
            flow alone (lines are then grouped per collaborator).

    Raises:
        ImportError: If matplotlib or seaborn is not installed.
    """
    compare = flow_8bit is not None

    rows = _loss_rows(flow_4bit, "4bit")
    if compare:
        rows += _loss_rows(flow_8bit, "8bit")
    if not rows:
        print("No loss data available to plot.")
        return

    _require_plotting(seaborn=True)

    loss_df = pd.DataFrame(rows)
    loss_columns = ["Training Loss", "Eval Loss"]
    # A column made only of None would otherwise stay dtype=object and break
    # both plotting and the mean/std aggregation below.
    loss_df[loss_columns] = loss_df[loss_columns].apply(pd.to_numeric)

    # Create a figure with subplots for loss metrics
    fig, axes = plt.subplots(2, 1, figsize=(15, 12), gridspec_kw={"height_ratios": [1, 1]})

    group_var = "Quantization" if compare else "Collaborator"
    panels = (
        (axes[0], "Training Loss", "Training Loss Across Rounds"),
        (axes[1], "Eval Loss", "Evaluation Loss Across Rounds"),
    )
    for ax, column, title in panels:
        sns.lineplot(
            x="Round Number",
            y=column,
            hue=group_var,
            data=loss_df,
            marker="o",
            sort=True,
            linewidth=3,
            markersize=10,
            ax=ax,
        )
        ax.set_title(title, fontsize=14, fontweight="bold")
        ax.set_xlabel("Round", fontsize=12)
        ax.set_ylabel("Loss", fontsize=12)
        ax.legend(title=group_var, bbox_to_anchor=(1.05, 1), loc="upper left")

    plt.tight_layout()
    plt.show()

    # Print summary statistics
    if compare:
        print("\n==== Loss Comparison: 4-bit vs 8-bit ====\n")

        # Group by quantization and compute means
        summary = loss_df.groupby("Quantization").agg(
            {"Training Loss": ["mean", "std"], "Eval Loss": ["mean", "std"]}
        )

        # Iterate over the labels that are actually present so a tracker
        # created with another quant_type string does not raise KeyError.
        display_names = {"4bit": "4-bit", "8bit": "8-bit"}
        for position, metric in enumerate(loss_columns):
            if position:
                print()
            for quant in summary.index:
                mean = summary.loc[quant, (metric, "mean")]
                std = summary.loc[quant, (metric, "std")]
                print(f"{metric} ({display_names.get(quant, quant)}): {mean:.4f} ± {std:.4f}")


def plot_aggregated_metrics(flow_4bit, flow_8bit):
    """Plot aggregated metrics comparing 4-bit and 8-bit quantization

    Reads ``average_loss_history``, ``agg_model_loss_history`` and
    ``local_model_loss_history`` from both flows. The left panel shows the
    average training loss per round; the right panel compares the last entry
    of each history.

    Args:
        flow_4bit: Flow run with 4-bit quantization.
        flow_8bit: Flow run with 8-bit quantization.

    Raises:
        ImportError: If matplotlib is not installed.
    """
    # Convert any tensor values to CPU floats before plotting
    histories = {
        label: [[_tensor_to_float(x) for x in getattr(flow, attr)] for _, attr in _HISTORY_ATTRS]
        for label, flow in (("4-bit", flow_4bit), ("8-bit", flow_8bit))
    }
    if any(not series for per_flow in histories.values() for series in per_flow):
        # The final-value comparison indexes [-1], so every history needs an entry.
        print("No aggregated metrics available to plot.")
        return

    _require_plotting()

    loss_history_4bit = histories["4-bit"][0]
    loss_history_8bit = histories["8-bit"][0]

    # Create a figure with subplots for aggregated metrics
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Plot average loss history; each flow gets its own x-range because the two
    # runs may have completed a different number of rounds.
    axes[0].plot(
        range(len(loss_history_4bit)),
        loss_history_4bit,
        "bo-",
        linewidth=2,
        markersize=8,
        label="4-bit",
    )
    axes[0].plot(
        range(len(loss_history_8bit)),
        loss_history_8bit,
        "ro-",
        linewidth=2,
        markersize=8,
        label="8-bit",
    )
    axes[0].set_title("Average Training Loss by Round", fontsize=14, fontweight="bold")
    axes[0].set_xlabel("Round", fontsize=12)
    axes[0].set_ylabel("Loss", fontsize=12)
    axes[0].grid(True, alpha=0.3)
    axes[0].legend(fontsize=10)

    # Plot final metrics comparison
    metrics = [name for name, _ in _HISTORY_ATTRS]
    values_4bit = [series[-1] for series in histories["4-bit"]]
    values_8bit = [series[-1] for series in histories["8-bit"]]

    x = np.arange(len(metrics))
    width = 0.35

    bars1 = axes[1].bar(x - width / 2, values_4bit, width, label="4-bit", color="blue", alpha=0.7)
    bars2 = axes[1].bar(x + width / 2, values_8bit, width, label="8-bit", color="red", alpha=0.7)

    # Add value labels on bars
    for bars in [bars1, bars2]:
        for bar in bars:
            height = bar.get_height()
            axes[1].annotate(
                f"{height:.4f}",
                xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3),
                textcoords="offset points",
                ha="center",
                va="bottom",
                fontsize=9,
            )

    axes[1].set_title("Final Metrics Comparison", fontsize=14, fontweight="bold")
    axes[1].set_ylabel("Loss", fontsize=12)
    axes[1].set_xticks(x)
    axes[1].set_xticklabels(metrics, rotation=15)
    axes[1].legend(loc="upper right", fontsize=10)
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Print percent differences
    print("\n==== Percentage Difference (8-bit vs 4-bit) ====\n")
    for metric, value_4bit, value_8bit in zip(metrics, values_4bit, values_8bit):
        pct_diff = _pct_diff(value_8bit, value_4bit)
        direction = "higher" if pct_diff > 0 else "lower"
        print(f"{metric}: 8-bit is {abs(pct_diff):.2f}% {direction} than 4-bit")