|
13 | 13 | }, |
14 | 14 | { |
15 | 15 | "cell_type": "code", |
16 | | - "execution_count": null, |
| 16 | + "execution_count": 12, |
17 | 17 | "id": "f4b3d21a", |
18 | 18 | "metadata": {}, |
19 | 19 | "outputs": [], |
|
22 | 22 | "import os\n", |
23 | 23 | "import requests\n", |
24 | 24 | "import time\n", |
25 | | - "import re" |
| 25 | + "import re\n", |
| 26 | + "from moviepy.editor import VideoFileClip" |
26 | 27 | ] |
27 | 28 | }, |
28 | 29 | { |
|
38 | 39 | }, |
39 | 40 | { |
40 | 41 | "cell_type": "code", |
41 | | - "execution_count": null, |
| 42 | + "execution_count": 13, |
42 | 43 | "id": "fd85fb30", |
43 | 44 | "metadata": {}, |
44 | 45 | "outputs": [], |
|
140 | 141 | "cell_type": "markdown", |
141 | 142 | "metadata": {}, |
142 | 143 | "source": [ |
143 | | - "### Call GPT-4V" |
| 144 | + "### Define GPT-4V API Call" |
144 | 145 | ] |
145 | 146 | }, |
146 | 147 | { |
147 | 148 | "cell_type": "code", |
148 | | - "execution_count": null, |
149 | | - "id": "b6165c63", |
| 149 | + "execution_count": 14, |
150 | 150 | "metadata": {}, |
151 | 151 | "outputs": [], |
| 152 | + "source": [ |
| 153 | + "# Define GPT-4V API call\n", |
| 154 | + "def call_GPT4V(vision_api_endpoint, vision_api_key, video_index_name, video_id, user_prompt, sys_message):\n", |
| 155 | + " # Construct the API request URL\n", |
| 156 | + " api_url = f\"{openai_api_base}/openai/deployments/{deployment_name}/extensions/chat/completions?api-version={openai_api_version}\"\n", |
| 157 | + "\n", |
| 158 | + " # Including the api-key in HTTP headers\n", |
| 159 | + " headers = {\n", |
| 160 | + " \"Content-Type\": \"application/json\",\n", |
| 161 | + " \"api-key\": openai_api_key,\n", |
| 162 | + " }\n", |
| 163 | + "\n", |
| 164 | + " # Payload for the request\n", |
| 165 | + " payload = {\n", |
| 166 | + " \"dataSources\": [\n", |
| 167 | + " {\n", |
| 168 | + " \"type\": \"AzureComputerVisionVideoIndex\",\n", |
| 169 | + " \"parameters\": {\n", |
| 170 | + " \"computerVisionBaseUrl\": f\"{vision_api_endpoint}/computervision\",\n", |
| 171 | + " \"computerVisionApiKey\": vision_api_key,\n", |
| 172 | + " \"indexName\": video_index_name,\n", |
| 173 | + " \"videoUrls\": [video_SAS_url]\n", |
| 174 | + " }\n", |
| 175 | + " }\n", |
| 176 | + " ],\n", |
| 177 | + " \"enhancements\": {\n", |
| 178 | + " \"video\": {\n", |
| 179 | + " \"enabled\": True\n", |
| 180 | + " }\n", |
| 181 | + " },\n", |
| 182 | + " \"messages\": [\n", |
| 183 | + " {\n", |
| 184 | + " \"role\": \"system\",\n", |
| 185 | + " \"content\": [\n", |
| 186 | + " sys_message\n", |
| 187 | + " ]\n", |
| 188 | + " },\n", |
| 189 | + " {\n", |
| 190 | + " \"role\": \"user\",\n", |
| 191 | + " \"content\": [\n", |
| 192 | + " {\n", |
| 193 | + " \"acv-document-id\": video_id\n", |
| 194 | + " },\n", |
| 195 | + " ]\n", |
| 196 | + " },\n", |
| 197 | + " {\n", |
| 198 | + " \"role\": \"user\",\n", |
| 199 | + " \"content\": [\n", |
| 200 | + " user_prompt\n", |
| 201 | + " ]\n", |
| 202 | + " }, \n", |
| 203 | + " ],\n", |
| 204 | + " \"temperature\": 0.7,\n", |
| 205 | + " \"top_p\": 0.95,\n", |
| 206 | + " \"max_tokens\": 800\n", |
| 207 | + " }\n", |
| 208 | + "\n", |
| 209 | + " # Send the request and handle the response\n", |
| 210 | + " try:\n", |
| 211 | + " response = requests.post(api_url, headers=headers, json=payload)\n", |
| 212 | + " response.raise_for_status() # Raise an error for bad HTTP status codes\n", |
| 213 | + " return response.json()\n", |
| 214 | + " except requests.RequestException as e:\n", |
| 215 | + " raise SystemExit(f\"Failed to make the request. Error: {e}\")" |
| 216 | + ] |
| 217 | + }, |
| 218 | + { |
| 219 | + "cell_type": "markdown", |
| 220 | + "metadata": {}, |
| 221 | + "source": [ |
| 222 | + "### Call GPT-4V On The Entire Video" |
| 223 | + ] |
| 224 | + }, |
| 225 | + { |
| 226 | + "cell_type": "code", |
| 227 | + "execution_count": 15, |
| 228 | + "id": "b6165c63", |
| 229 | + "metadata": {}, |
| 230 | + "outputs": [ |
| 231 | + { |
| 232 | + "name": "stdout", |
| 233 | + "output_type": "stream", |
| 234 | + "text": [ |
| 235 | + "The advertisement video highlights the features of Microsoft's new virtual assistant, Copilot.\n", |
| 236 | + "Throughout the video, the assistant is shown helping users in various ways, such as creating pictures, exploring variations, reading, and organizing plans.\n", |
| 237 | + "The ad uses colorful, serene, and visually pleasing backgrounds to convey a sense of calmness and creativity.\n", |
| 238 | + "It demonstrates how Copilot can assist in enhancing the user’s digital experience, making it more organized, creative, and productive.\n", |
| 239 | + "The main message is to introduce and promote Copilot as a helpful and reliable assistant that makes the user's digital interaction easier and more enjoyable.\n" |
| 240 | + ] |
| 241 | + } |
| 242 | + ], |
152 | 243 | "source": [ |
153 | 244 | "# System messages and user prompt\n", |
154 | 245 | "sys_message = \"\"\"\n", |
|
160 | 251 | "\"\"\"\n", |
161 | 252 | "user_prompt = \"Summarize the ad video\"\n", |
162 | 253 | "\n", |
163 | | - "# Construct the API request URL\n", |
164 | | - "api_url = f\"{openai_api_base}/openai/deployments/{deployment_name}/extensions/chat/completions?api-version={openai_api_version}\"\n", |
165 | | - "\n", |
166 | | - "# Including the api-key in HTTP headers\n", |
167 | | - "headers = {\n", |
168 | | - " \"Content-Type\": \"application/json\",\n", |
169 | | - " \"api-key\": openai_api_key,\n", |
170 | | - "}\n", |
171 | | - "\n", |
172 | | - "# Payload for the request\n", |
173 | | - "payload = {\n", |
174 | | - " \"dataSources\": [\n", |
175 | | - " {\n", |
176 | | - " \"type\": \"AzureComputerVisionVideoIndex\",\n", |
177 | | - " \"parameters\": {\n", |
178 | | - " \"computerVisionBaseUrl\": f\"{vision_api_endpoint}/computervision\",\n", |
179 | | - " \"computerVisionApiKey\": vision_api_key,\n", |
180 | | - " \"indexName\": video_index_name,\n", |
181 | | - " \"videoUrls\": [video_SAS_url]\n", |
182 | | - " }\n", |
183 | | - " }\n", |
184 | | - " ],\n", |
185 | | - " \"enhancements\": {\n", |
186 | | - " \"video\": {\n", |
187 | | - " \"enabled\": True\n", |
188 | | - " }\n", |
189 | | - " },\n", |
190 | | - " \"messages\": [\n", |
191 | | - " {\n", |
192 | | - " \"role\": \"system\",\n", |
193 | | - " \"content\": [\n", |
194 | | - " sys_message\n", |
195 | | - " ]\n", |
196 | | - " },\n", |
197 | | - " {\n", |
198 | | - " \"role\": \"user\",\n", |
199 | | - " \"content\": [\n", |
200 | | - " {\n", |
201 | | - " \"acv-document-id\": video_id\n", |
202 | | - " },\n", |
203 | | - " ]\n", |
204 | | - " },\n", |
205 | | - " {\n", |
206 | | - " \"role\": \"user\",\n", |
207 | | - " \"content\": [\n", |
208 | | - " user_prompt\n", |
209 | | - " ]\n", |
210 | | - " }, \n", |
211 | | - " ],\n", |
212 | | - " \"temperature\": 0.7,\n", |
213 | | - " \"top_p\": 0.95,\n", |
214 | | - " \"max_tokens\": 800\n", |
215 | | - "}\n", |
216 | | - "\n", |
217 | | - "# Send the request and handle the response\n", |
| 254 | + "# Call GPT-4V API and print the response\n", |
218 | 255 | "try:\n", |
219 | | - " response = requests.post(api_url, headers=headers, json=payload)\n", |
220 | | - " response.raise_for_status() # Raise an error for bad HTTP status codes\n", |
221 | | - " response_content = response.json()\n", |
222 | | - " text = response_content['choices'][0]['message']['content']\n", |
| 256 | + " response = call_GPT4V(vision_api_endpoint, vision_api_key, video_index_name, video_id, user_prompt, sys_message)\n", |
| 257 | + " text = response['choices'][0]['message']['content']\n", |
223 | 258 | " sentences = re.split(r'(?<!\\w\\.\\w.)(?<![A-Z][a-z]\\.)(?<=\\.|\\?)\\s', text)\n", |
224 | 259 | " for sentence in sentences: # Print the content of the response\n", |
225 | 260 | " print(sentence)\n", |
226 | 261 | "except requests.RequestException as e:\n", |
227 | 262 | " raise SystemExit(f\"Failed to make the request. Error: {e}\")" |
228 | 263 | ] |
229 | 264 | }, |
| 265 | + { |
| 266 | + "cell_type": "markdown", |
| 267 | + "metadata": {}, |
| 268 | + "source": [ |
| 269 | + "### Call GPT-4V On Each Video Chunk Sequentially" |
| 270 | + ] |
| 271 | + }, |
230 | 272 | { |
231 | 273 | "cell_type": "code", |
232 | | - "execution_count": null, |
| 274 | + "execution_count": 18, |
233 | 275 | "metadata": {}, |
234 | | - "outputs": [], |
235 | | - "source": [] |
| 276 | + "outputs": [ |
| 277 | + { |
| 278 | + "name": "stdout", |
| 279 | + "output_type": "stream", |
| 280 | + "text": [ |
| 281 | + "Video Length: 46.13 seconds\n", |
| 282 | + "Segment 1: How many scenes from 0s to 20s?\n", |
| 283 | + "Segment 2: How many scenes from 20s to 40s?\n", |
| 284 | + "Segment 3: How many scenes from 40s to 46.13s?\n", |
| 285 | + "There are two scenes from 40s to 46.13s.\n", |
| 286 | + "\n", |
| 287 | + "Here is the updated summary of all scenes:\n", |
| 288 | + "\n", |
| 289 | + "1.\n", |
| 290 | + "Scene 1 (00:00 - 00:00.500000): A serene environment with a beautiful house surrounded by nature.\n", |
| 291 | + "2.\n", |
| 292 | + "Scene 2 (00:00.500000 - 00:03): A closer look at the house's elegant architecture and interior design.\n", |
| 293 | + "3.\n", |
| 294 | + "Scene 3 (00:03 - 00:06): Introduction of the \"Hello Copilot\" feature on a gradient background.\n", |
| 295 | + "4.\n", |
| 296 | + "Scene 4 (00:06 - 00:09): The \"Inspire me\" feature introduced, suggesting creativity aids.\n", |
| 297 | + "5.\n", |
| 298 | + "Scene 5 (00:09 - 00:12): A desktop view showing a picturesque landscape as the wallpaper.\n", |
| 299 | + "6.\n", |
| 300 | + "Scene 6 (00:12 - 00:15): Copilot's interface is shown with colorful and interactive designs.\n", |
| 301 | + "7.\n", |
| 302 | + "Scene 7 (00:15 - 00:18.500000): A transition with a gradient background.\n", |
| 303 | + "8.\n", |
| 304 | + "Scene 8 (00:18.500000 - 00:22): A closer look at the texture and materials of the Copilot device.\n", |
| 305 | + "9.\n", |
| 306 | + "Scene 9 (00:22 - 00:23): \"Research a topic\" feature is introduced on a gradient background.\n", |
| 307 | + "10.\n", |
| 308 | + "Scene 10 (00:23 - 00:26): A webpage on sustainable design appears, demonstrating the research feature.\n", |
| 309 | + "11.\n", |
| 310 | + "Scene 11 (00:26 - 00:29): A blurred transition scene.\n", |
| 311 | + "12.\n", |
| 312 | + "Scene 12 (00:29 - 00:33): The interface of an email or messaging app is displayed.\n", |
| 313 | + "13.\n", |
| 314 | + "Scene 13 (00:33 - 00:36): \"Organize my\" feature introduced on a gradient background.\n", |
| 315 | + "14.\n", |
| 316 | + "Scene 14 (00:36 - 00:39): \"Help me relax\" feature is displayed in a room with dim lights, indicating a relaxation or sleep mode.\n", |
| 317 | + "15.\n", |
| 318 | + "Scene 15 (00:39 - 00:43): A scene showing a modern living room, indicating the integration of Copilot in everyday living spaces.\n", |
| 319 | + "16.\n", |
| 320 | + "Scene 16 (00:43 - 00:46.13): Copilot logo appears, indicating it as an everyday AI companion.\n" |
| 321 | + ] |
| 322 | + } |
| 323 | + ], |
| 324 | + "source": [ |
| 325 | + "def download_video(sas_url, local_file_path):\n", |
| 326 | + " try:\n", |
| 327 | + " response = requests.get(sas_url, stream=True)\n", |
| 328 | + " if response.status_code == 200:\n", |
| 329 | + " with open(local_file_path, 'wb') as file:\n", |
| 330 | + " for chunk in response.iter_content(chunk_size=8192):\n", |
| 331 | + " file.write(chunk)\n", |
| 332 | + " return True\n", |
| 333 | + " else:\n", |
| 334 | + " print(f\"Download failed with status code: {response.status_code}\")\n", |
| 335 | + " return False\n", |
| 336 | + " except Exception as e:\n", |
| 337 | + " print(f\"An error occurred during download: {e}\")\n", |
| 338 | + " return False\n", |
| 339 | + "\n", |
| 340 | + "def get_video_length(file_path):\n", |
| 341 | + " try:\n", |
| 342 | + " with VideoFileClip(file_path) as video:\n", |
| 343 | + " return video.duration\n", |
| 344 | + " except Exception as e:\n", |
| 345 | + " print(f\"Error in getting video length: {e}\")\n", |
| 346 | + " return None\n", |
| 347 | + "\n", |
| 348 | + "# Define the number of seconds for each segment\n", |
| 349 | + "chunk_size = 20\n", |
| 350 | + "# Download the video\n", |
| 351 | + "local_file_path = \"downloaded_video.mp4\"\n", |
| 352 | + "if download_video(video_SAS_url, local_file_path):\n", |
| 353 | + " video_length = get_video_length(local_file_path)\n", |
| 354 | + " os.remove(local_file_path)\n", |
| 355 | + "\n", |
| 356 | + " if video_length is not None:\n", |
| 357 | + " print(f\"Video Length: {video_length} seconds\")\n", |
| 358 | + " sys_message = f\"\"\"\n", |
| 359 | + " The total length of the video is {video_length}s. Your task is to assist in finding all scenes in this video.\n", |
| 360 | + " You also need to describe each scene with start and end time. \n", |
| 361 | + " \"\"\"\n", |
| 362 | + " number_of_segments = int(video_length // chunk_size)\n", |
| 363 | + " updated_response = \"\"\n", |
| 364 | + " for i in range(number_of_segments + 1): # Include the last segment\n", |
| 365 | + " start_time = i * chunk_size\n", |
| 366 | + " end_time = min((i + 1) * chunk_size, video_length)\n", |
| 367 | + " user_prompt = f\"How many scenes from {start_time}s to {end_time}s?\"\n", |
| 368 | + " print(f\"Segment {i+1}: {user_prompt}\")\n", |
| 369 | + " if i > 0:\n", |
| 370 | + " user_prompt += f\"\"\"And here are scenes in the previous segments: {updated_response}. \n", |
| 371 | + " You need to combine the scenes in the previous segments with the scenes in this segment and provide a summary.\n", |
| 372 | + " \"\"\"\n", |
| 373 | + " \n", |
| 374 | + " response = call_GPT4V(vision_api_endpoint, vision_api_key, video_index_name, video_id, user_prompt, sys_message)\n", |
| 375 | + " updated_response = response['choices'][0]['message']['content']\n", |
| 376 | + " #print(f\"Response for segment {i+1}: {updated_response}\")\n", |
| 377 | + " time.sleep(2) # Avoid throttling\n", |
| 378 | + " \n", |
| 379 | + " # Print the final response\n", |
| 380 | + " sentences = re.split(r'(?<!\\w\\.\\w.)(?<![A-Z][a-z]\\.)(?<=\\.|\\?)\\s', updated_response)\n", |
| 381 | + " for sentence in sentences: # Print the content of the response\n", |
| 382 | + " print(sentence)\n", |
| 383 | + " else:\n", |
| 384 | + " print(\"Failed to process video length.\")\n", |
| 385 | + "else:\n", |
| 386 | + " print(\"Failed to download video.\")" |
| 387 | + ] |
236 | 388 | } |
237 | 389 | ], |
238 | 390 | "metadata": { |
|
0 commit comments