Add Call GPT-4V On Each Video Chunk Sequentially cell in video sample

Zhi Zhou · Zhi Zhou · commit 8a86f22fe232 · 2023-11-17T13:58:25.000-08:00
diff --git a/Basic_Samples/GPT-4V/requirements.txt b/Basic_Samples/GPT-4V/requirements.txt
@@ -4,4 +4,5 @@ jupyter
 # Other packages needed to run the notebook samples
 requests
 pillow
-matplotlib
+matplotlib
+moviepy
diff --git a/Basic_Samples/GPT-4V/video_chatcompletions_example_restapi.ipynb b/Basic_Samples/GPT-4V/video_chatcompletions_example_restapi.ipynb
@@ -13,7 +13,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "id": "f4b3d21a",
    "metadata": {},
    "outputs": [],
@@ -22,7 +22,8 @@
     "import os\n",
     "import requests\n",
     "import time\n",
-    "import re"
+    "import re\n",
+    "from moviepy.editor import VideoFileClip"
    ]
   },
   {
@@ -38,7 +39,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 13,
    "id": "fd85fb30",
    "metadata": {},
    "outputs": [],
@@ -140,15 +141,105 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Call GPT-4V"
+    "### Define GPT-4V API Call"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "b6165c63",
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
+   "source": [
+    "# Define GPT-4V API call\n",
+    "def call_GPT4V(vision_api_endpoint, vision_api_key, video_index_name, video_id, user_prompt, sys_message):\n",
+    "    \"\"\"POST one chat-completions request with the video enhancement enabled and return the parsed JSON.\n",
+    "\n",
+    "    Besides its arguments, this reads the notebook globals openai_api_base, deployment_name,\n",
+    "    openai_api_version, openai_api_key and video_SAS_url, so the cells defining them must run first.\n",
+    "\n",
+    "    Args:\n",
+    "        vision_api_endpoint: Computer Vision endpoint; /computervision is appended to it.\n",
+    "        vision_api_key: Key sent as computerVisionApiKey.\n",
+    "        video_index_name: Name sent as indexName.\n",
+    "        video_id: Value of the acv-document-id entry in the first user message.\n",
+    "        user_prompt: Content of the second user message.\n",
+    "        sys_message: Content of the system message.\n",
+    "\n",
+    "    Returns:\n",
+    "        The response body decoded with response.json().\n",
+    "\n",
+    "    Raises:\n",
+    "        SystemExit: If requests raises a RequestException, including the HTTP error\n",
+    "            raised by raise_for_status() for a bad status code.\n",
+    "    \"\"\"\n",
+    "    # Construct the API request URL for the configured deployment\n",
+    "    request_url = f\"{openai_api_base}/openai/deployments/{deployment_name}/extensions/chat/completions?api-version={openai_api_version}\"\n",
+    "\n",
+    "    # The api-key travels in the HTTP headers\n",
+    "    request_headers = {\"Content-Type\": \"application/json\", \"api-key\": openai_api_key}\n",
+    "\n",
+    "    # Data source entry describing the Computer Vision video index\n",
+    "    video_index_source = {\n",
+    "        \"type\": \"AzureComputerVisionVideoIndex\",\n",
+    "        \"parameters\": {\n",
+    "            \"computerVisionBaseUrl\": f\"{vision_api_endpoint}/computervision\",\n",
+    "            \"computerVisionApiKey\": vision_api_key,\n",
+    "            \"indexName\": video_index_name,\n",
+    "            \"videoUrls\": [video_SAS_url],\n",
+    "        },\n",
+    "    }\n",
+    "\n",
+    "    # Conversation: system message, the video reference, then the question\n",
+    "    chat_messages = [\n",
+    "        {\"role\": \"system\", \"content\": [sys_message]},\n",
+    "        {\"role\": \"user\", \"content\": [{\"acv-document-id\": video_id}]},\n",
+    "        {\"role\": \"user\", \"content\": [user_prompt]},\n",
+    "    ]\n",
+    "\n",
+    "    # Payload for the request\n",
+    "    request_body = {\n",
+    "        \"dataSources\": [video_index_source],\n",
+    "        \"enhancements\": {\"video\": {\"enabled\": True}},\n",
+    "        \"messages\": chat_messages,\n",
+    "        \"temperature\": 0.7,\n",
+    "        \"top_p\": 0.95,\n",
+    "        \"max_tokens\": 800,\n",
+    "    }\n",
+    "\n",
+    "    # Send the request and handle the response\n",
+    "    try:\n",
+    "        reply = requests.post(request_url, headers=request_headers, json=request_body)\n",
+    "        reply.raise_for_status()  # Raise an error for bad HTTP status codes\n",
+    "        return reply.json()\n",
+    "    except requests.RequestException as e:\n",
+    "        raise SystemExit(f\"Failed to make the request. Error: {e}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Call GPT-4V On The Entire Video"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "b6165c63",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The advertisement video highlights the features of Microsoft's new virtual assistant, Copilot.\n",
+      "Throughout the video, the assistant is shown helping users in various ways, such as creating pictures, exploring variations, reading, and organizing plans.\n",
+      "The ad uses colorful, serene, and visually pleasing backgrounds to convey a sense of calmness and creativity.\n",
+      "It demonstrates how Copilot can assist in enhancing the user’s digital experience, making it more organized, creative, and productive.\n",
+      "The main message is to introduce and promote Copilot as a helpful and reliable assistant that makes the user's digital interaction easier and more enjoyable.\n"
+     ]
+    }
+   ],
    "source": [
     "# System messages and user prompt\n",
     "sys_message = \"\"\"\n",
@@ -160,79 +251,140 @@
     "\"\"\"\n",
     "user_prompt = \"Summarize the ad video\"\n",
     "\n",
-    "# Construct the API request URL\n",
-    "api_url = f\"{openai_api_base}/openai/deployments/{deployment_name}/extensions/chat/completions?api-version={openai_api_version}\"\n",
-    "\n",
-    "# Including the api-key in HTTP headers\n",
-    "headers = {\n",
-    "    \"Content-Type\": \"application/json\",\n",
-    "    \"api-key\": openai_api_key,\n",
-    "}\n",
-    "\n",
-    "# Payload for the request\n",
-    "payload = {\n",
-    "  \"dataSources\": [\n",
-    "    {\n",
-    "      \"type\": \"AzureComputerVisionVideoIndex\",\n",
-    "      \"parameters\": {\n",
-    "        \"computerVisionBaseUrl\": f\"{vision_api_endpoint}/computervision\",\n",
-    "        \"computerVisionApiKey\": vision_api_key,\n",
-    "        \"indexName\": video_index_name,\n",
-    "        \"videoUrls\": [video_SAS_url]\n",
-    "      }\n",
-    "    }\n",
-    "  ],\n",
-    "  \"enhancements\": {\n",
-    "        \"video\": {\n",
-    "            \"enabled\": True\n",
-    "        }\n",
-    "    },\n",
-    "  \"messages\": [\n",
-    "    {\n",
-    "      \"role\": \"system\",\n",
-    "      \"content\": [\n",
-    "        sys_message\n",
-    "      ]\n",
-    "    },\n",
-    "    {\n",
-    "      \"role\": \"user\",\n",
-    "      \"content\": [\n",
-    "        {\n",
-    "          \"acv-document-id\": video_id\n",
-    "        },\n",
-    "      ]\n",
-    "    },\n",
-    "    {\n",
-    "      \"role\": \"user\",\n",
-    "      \"content\": [\n",
-    "        user_prompt\n",
-    "      ]\n",
-    "    }, \n",
-    "  ],\n",
-    "  \"temperature\": 0.7,\n",
-    "  \"top_p\": 0.95,\n",
-    "  \"max_tokens\": 800\n",
-    "}\n",
-    "\n",
-    "# Send the request and handle the response\n",
+    "# Call GPT-4V API and print the response\n",
     "try:\n",
-    "    response = requests.post(api_url, headers=headers, json=payload)\n",
-    "    response.raise_for_status()  # Raise an error for bad HTTP status codes\n",
-    "    response_content = response.json()\n",
-    "    text = response_content['choices'][0]['message']['content']\n",
+    "    response = call_GPT4V(vision_api_endpoint, vision_api_key, video_index_name, video_id, user_prompt, sys_message)\n",
+    "    text = response['choices'][0]['message']['content']\n",
     "    sentences = re.split(r'(?<!\\w\\.\\w.)(?<![A-Z][a-z]\\.)(?<=\\.|\\?)\\s', text)\n",
     "    for sentence in sentences:  # Print the content of the response\n",
     "        print(sentence)\n",
     "except requests.RequestException as e:\n",
     "    raise SystemExit(f\"Failed to make the request. Error: {e}\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Call GPT-4V On Each Video Chunk Sequentially"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 18,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Video Length: 46.13 seconds\n",
+      "Segment 1: How many scenes from 0s to 20s?\n",
+      "Segment 2: How many scenes from 20s to 40s?\n",
+      "Segment 3: How many scenes from 40s to 46.13s?\n",
+      "There are two scenes from 40s to 46.13s.\n",
+      "\n",
+      "Here is the updated summary of all scenes:\n",
+      "\n",
+      "1.\n",
+      "Scene 1 (00:00 - 00:00.500000): A serene environment with a beautiful house surrounded by nature.\n",
+      "2.\n",
+      "Scene 2 (00:00.500000 - 00:03): A closer look at the house's elegant architecture and interior design.\n",
+      "3.\n",
+      "Scene 3 (00:03 - 00:06): Introduction of the \"Hello Copilot\" feature on a gradient background.\n",
+      "4.\n",
+      "Scene 4 (00:06 - 00:09): The \"Inspire me\" feature introduced, suggesting creativity aids.\n",
+      "5.\n",
+      "Scene 5 (00:09 - 00:12): A desktop view showing a picturesque landscape as the wallpaper.\n",
+      "6.\n",
+      "Scene 6 (00:12 - 00:15): Copilot's interface is shown with colorful and interactive designs.\n",
+      "7.\n",
+      "Scene 7 (00:15 - 00:18.500000): A transition with a gradient background.\n",
+      "8.\n",
+      "Scene 8 (00:18.500000 - 00:22): A closer look at the texture and materials of the Copilot device.\n",
+      "9.\n",
+      "Scene 9 (00:22 - 00:23): \"Research a topic\" feature is introduced on a gradient background.\n",
+      "10.\n",
+      "Scene 10 (00:23 - 00:26): A webpage on sustainable design appears, demonstrating the research feature.\n",
+      "11.\n",
+      "Scene 11 (00:26 - 00:29): A blurred transition scene.\n",
+      "12.\n",
+      "Scene 12 (00:29 - 00:33): The interface of an email or messaging app is displayed.\n",
+      "13.\n",
+      "Scene 13 (00:33 - 00:36): \"Organize my\" feature introduced on a gradient background.\n",
+      "14.\n",
+      "Scene 14 (00:36 - 00:39): \"Help me relax\" feature is displayed in a room with dim lights, indicating a relaxation or sleep mode.\n",
+      "15.\n",
+      "Scene 15 (00:39 - 00:43): A scene showing a modern living room, indicating the integration of Copilot in everyday living spaces.\n",
+      "16.\n",
+      "Scene 16 (00:43 - 00:46.13): Copilot logo appears, indicating it as an everyday AI companion.\n"
+     ]
+    }
+   ],
+   "source": [
+    "def download_video(sas_url, local_file_path):\n",
+    "    \"\"\"Stream sas_url into local_file_path; return True on success, False (after printing why) on any failure.\"\"\"\n",
+    "    try:\n",
+    "        with requests.get(sas_url, stream=True, timeout=60) as response:  # closes the connection; timeout avoids hanging\n",
+    "            if response.status_code != 200:\n",
+    "                print(f\"Download failed with status code: {response.status_code}\")\n",
+    "                return False\n",
+    "            with open(local_file_path, 'wb') as file:\n",
+    "                for chunk in response.iter_content(chunk_size=8192):\n",
+    "                    file.write(chunk)\n",
+    "        return True\n",
+    "    except Exception as e:\n",
+    "        print(f\"An error occurred during download: {e}\")\n",
+    "        return False\n",
+    "\n",
+    "def get_video_length(file_path):\n",
+    "    \"\"\"Return the duration VideoFileClip reports for file_path; print the error and return None on failure.\"\"\"\n",
+    "    try:\n",
+    "        with VideoFileClip(file_path) as clip:\n",
+    "            return clip.duration\n",
+    "    except Exception as err:\n",
+    "        print(f\"Error in getting video length: {err}\")  # falls through, so the result is None\n",
+    "\n",
+    "# Define the number of seconds for each segment\n",
+    "chunk_size = 20\n",
+    "# Download the video locally so that its duration can be measured\n",
+    "local_file_path = \"downloaded_video.mp4\"\n",
+    "if download_video(video_SAS_url, local_file_path):\n",
+    "    video_length = get_video_length(local_file_path)\n",
+    "    os.remove(local_file_path)  # The local copy was only needed to read the duration\n",
+    "\n",
+    "    if video_length is not None:\n",
+    "        print(f\"Video Length: {video_length} seconds\")\n",
+    "        sys_message = f\"\"\"\n",
+    "        The total length of the video is {video_length}s. Your task is to assist in finding all scenes in this video.\n",
+    "        You also need to describe each scene with start and end time. \n",
+    "        \"\"\"\n",
+    "        # Ceiling division: a length that is an exact multiple of chunk_size must not add an empty last segment\n",
+    "        number_of_segments = int(-(-video_length // chunk_size))\n",
+    "        updated_response = \"\"\n",
+    "        for i in range(number_of_segments):\n",
+    "            start_time = i * chunk_size\n",
+    "            end_time = min((i + 1) * chunk_size, video_length)\n",
+    "            user_prompt = f\"How many scenes from {start_time}s to {end_time}s?\"\n",
+    "            print(f\"Segment {i+1}: {user_prompt}\")\n",
+    "            if i > 0:\n",
+    "                user_prompt += f\"\"\" And here are scenes in the previous segments: {updated_response}. \n",
+    "                                You need to combine the scenes in the previous segments with the scenes in this segment and provide a summary.\n",
+    "                                \"\"\"\n",
+    "            \n",
+    "            response = call_GPT4V(vision_api_endpoint, vision_api_key, video_index_name, video_id, user_prompt, sys_message)\n",
+    "            updated_response = response['choices'][0]['message']['content']  # Carried into the next segment's prompt\n",
+    "            time.sleep(2) # Avoid throttling\n",
+    "        \n",
+    "        # Print the final response\n",
+    "        sentences = re.split(r'(?<!\\w\\.\\w.)(?<![A-Z][a-z]\\.)(?<=\\.|\\?)\\s', updated_response)\n",
+    "        for sentence in sentences:  # Print the content of the response\n",
+    "            print(sentence)\n",
+    "    else:\n",
+    "        print(\"Failed to process video length.\")\n",
+    "else:\n",
+    "    print(\"Failed to download video.\")"
+   ]
   }
  ],
  "metadata": {