Skip to content

Commit 8a86f22

Browse files
author
Zhi Zhou
committed
Add Call GPT-4V On Each Video Chunk Sequentially cell in video sample
1 parent 47c06ce commit 8a86f22

File tree

2 files changed

+222
-69
lines changed

2 files changed

+222
-69
lines changed

Basic_Samples/GPT-4V/requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@ jupyter
44
# Other packages needed to run the notebook samples
55
requests
66
pillow
7-
matplotlib
7+
matplotlib
8+
moviepy

Basic_Samples/GPT-4V/video_chatcompletions_example_restapi.ipynb

Lines changed: 220 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
},
1414
{
1515
"cell_type": "code",
16-
"execution_count": null,
16+
"execution_count": 12,
1717
"id": "f4b3d21a",
1818
"metadata": {},
1919
"outputs": [],
@@ -22,7 +22,8 @@
2222
"import os\n",
2323
"import requests\n",
2424
"import time\n",
25-
"import re"
25+
"import re\n",
26+
"from moviepy.editor import VideoFileClip"
2627
]
2728
},
2829
{
@@ -38,7 +39,7 @@
3839
},
3940
{
4041
"cell_type": "code",
41-
"execution_count": null,
42+
"execution_count": 13,
4243
"id": "fd85fb30",
4344
"metadata": {},
4445
"outputs": [],
@@ -140,15 +141,105 @@
140141
"cell_type": "markdown",
141142
"metadata": {},
142143
"source": [
143-
"### Call GPT-4V"
144+
"### Define GPT-4V API Call"
144145
]
145146
},
146147
{
147148
"cell_type": "code",
148-
"execution_count": null,
149-
"id": "b6165c63",
149+
"execution_count": 14,
150150
"metadata": {},
151151
"outputs": [],
152+
"source": [
153+
"# Define GPT-4V API call\n",
154+
"def call_GPT4V(vision_api_endpoint, vision_api_key, video_index_name, video_id, user_prompt, sys_message):\n",
155+
" # Construct the API request URL\n",
156+
" api_url = f\"{openai_api_base}/openai/deployments/{deployment_name}/extensions/chat/completions?api-version={openai_api_version}\"\n",
157+
"\n",
158+
" # Including the api-key in HTTP headers\n",
159+
" headers = {\n",
160+
" \"Content-Type\": \"application/json\",\n",
161+
" \"api-key\": openai_api_key,\n",
162+
" }\n",
163+
"\n",
164+
" # Payload for the request\n",
165+
" payload = {\n",
166+
" \"dataSources\": [\n",
167+
" {\n",
168+
" \"type\": \"AzureComputerVisionVideoIndex\",\n",
169+
" \"parameters\": {\n",
170+
" \"computerVisionBaseUrl\": f\"{vision_api_endpoint}/computervision\",\n",
171+
" \"computerVisionApiKey\": vision_api_key,\n",
172+
" \"indexName\": video_index_name,\n",
173+
" \"videoUrls\": [video_SAS_url]\n",
174+
" }\n",
175+
" }\n",
176+
" ],\n",
177+
" \"enhancements\": {\n",
178+
" \"video\": {\n",
179+
" \"enabled\": True\n",
180+
" }\n",
181+
" },\n",
182+
" \"messages\": [\n",
183+
" {\n",
184+
" \"role\": \"system\",\n",
185+
" \"content\": [\n",
186+
" sys_message\n",
187+
" ]\n",
188+
" },\n",
189+
" {\n",
190+
" \"role\": \"user\",\n",
191+
" \"content\": [\n",
192+
" {\n",
193+
" \"acv-document-id\": video_id\n",
194+
" },\n",
195+
" ]\n",
196+
" },\n",
197+
" {\n",
198+
" \"role\": \"user\",\n",
199+
" \"content\": [\n",
200+
" user_prompt\n",
201+
" ]\n",
202+
" }, \n",
203+
" ],\n",
204+
" \"temperature\": 0.7,\n",
205+
" \"top_p\": 0.95,\n",
206+
" \"max_tokens\": 800\n",
207+
" }\n",
208+
"\n",
209+
" # Send the request and handle the response\n",
210+
" try:\n",
211+
" response = requests.post(api_url, headers=headers, json=payload)\n",
212+
" response.raise_for_status() # Raise an error for bad HTTP status codes\n",
213+
" return response.json()\n",
214+
" except requests.RequestException as e:\n",
215+
" raise SystemExit(f\"Failed to make the request. Error: {e}\")"
216+
]
217+
},
218+
{
219+
"cell_type": "markdown",
220+
"metadata": {},
221+
"source": [
222+
"### Call GPT-4V On The Entire Video"
223+
]
224+
},
225+
{
226+
"cell_type": "code",
227+
"execution_count": 15,
228+
"id": "b6165c63",
229+
"metadata": {},
230+
"outputs": [
231+
{
232+
"name": "stdout",
233+
"output_type": "stream",
234+
"text": [
235+
"The advertisement video highlights the features of Microsoft's new virtual assistant, Copilot.\n",
236+
"Throughout the video, the assistant is shown helping users in various ways, such as creating pictures, exploring variations, reading, and organizing plans.\n",
237+
"The ad uses colorful, serene, and visually pleasing backgrounds to convey a sense of calmness and creativity.\n",
238+
"It demonstrates how Copilot can assist in enhancing the user’s digital experience, making it more organized, creative, and productive.\n",
239+
"The main message is to introduce and promote Copilot as a helpful and reliable assistant that makes the user's digital interaction easier and more enjoyable.\n"
240+
]
241+
}
242+
],
152243
"source": [
153244
"# System messages and user prompt\n",
154245
"sys_message = \"\"\"\n",
@@ -160,79 +251,140 @@
160251
"\"\"\"\n",
161252
"user_prompt = \"Summarize the ad video\"\n",
162253
"\n",
163-
"# Construct the API request URL\n",
164-
"api_url = f\"{openai_api_base}/openai/deployments/{deployment_name}/extensions/chat/completions?api-version={openai_api_version}\"\n",
165-
"\n",
166-
"# Including the api-key in HTTP headers\n",
167-
"headers = {\n",
168-
" \"Content-Type\": \"application/json\",\n",
169-
" \"api-key\": openai_api_key,\n",
170-
"}\n",
171-
"\n",
172-
"# Payload for the request\n",
173-
"payload = {\n",
174-
" \"dataSources\": [\n",
175-
" {\n",
176-
" \"type\": \"AzureComputerVisionVideoIndex\",\n",
177-
" \"parameters\": {\n",
178-
" \"computerVisionBaseUrl\": f\"{vision_api_endpoint}/computervision\",\n",
179-
" \"computerVisionApiKey\": vision_api_key,\n",
180-
" \"indexName\": video_index_name,\n",
181-
" \"videoUrls\": [video_SAS_url]\n",
182-
" }\n",
183-
" }\n",
184-
" ],\n",
185-
" \"enhancements\": {\n",
186-
" \"video\": {\n",
187-
" \"enabled\": True\n",
188-
" }\n",
189-
" },\n",
190-
" \"messages\": [\n",
191-
" {\n",
192-
" \"role\": \"system\",\n",
193-
" \"content\": [\n",
194-
" sys_message\n",
195-
" ]\n",
196-
" },\n",
197-
" {\n",
198-
" \"role\": \"user\",\n",
199-
" \"content\": [\n",
200-
" {\n",
201-
" \"acv-document-id\": video_id\n",
202-
" },\n",
203-
" ]\n",
204-
" },\n",
205-
" {\n",
206-
" \"role\": \"user\",\n",
207-
" \"content\": [\n",
208-
" user_prompt\n",
209-
" ]\n",
210-
" }, \n",
211-
" ],\n",
212-
" \"temperature\": 0.7,\n",
213-
" \"top_p\": 0.95,\n",
214-
" \"max_tokens\": 800\n",
215-
"}\n",
216-
"\n",
217-
"# Send the request and handle the response\n",
254+
"# Call GPT-4V API and print the response\n",
218255
"try:\n",
219-
" response = requests.post(api_url, headers=headers, json=payload)\n",
220-
" response.raise_for_status() # Raise an error for bad HTTP status codes\n",
221-
" response_content = response.json()\n",
222-
" text = response_content['choices'][0]['message']['content']\n",
256+
" response = call_GPT4V(vision_api_endpoint, vision_api_key, video_index_name, video_id, user_prompt, sys_message)\n",
257+
" text = response['choices'][0]['message']['content']\n",
223258
" sentences = re.split(r'(?<!\\w\\.\\w.)(?<![A-Z][a-z]\\.)(?<=\\.|\\?)\\s', text)\n",
224259
" for sentence in sentences: # Print the content of the response\n",
225260
" print(sentence)\n",
226261
"except requests.RequestException as e:\n",
227262
" raise SystemExit(f\"Failed to make the request. Error: {e}\")"
228263
]
229264
},
265+
{
266+
"cell_type": "markdown",
267+
"metadata": {},
268+
"source": [
269+
"### Call GPT-4V On Each Video Chunk Sequentially"
270+
]
271+
},
230272
{
231273
"cell_type": "code",
232-
"execution_count": null,
274+
"execution_count": 18,
233275
"metadata": {},
234-
"outputs": [],
235-
"source": []
276+
"outputs": [
277+
{
278+
"name": "stdout",
279+
"output_type": "stream",
280+
"text": [
281+
"Video Length: 46.13 seconds\n",
282+
"Segment 1: How many scenes from 0s to 20s?\n",
283+
"Segment 2: How many scenes from 20s to 40s?\n",
284+
"Segment 3: How many scenes from 40s to 46.13s?\n",
285+
"There are two scenes from 40s to 46.13s.\n",
286+
"\n",
287+
"Here is the updated summary of all scenes:\n",
288+
"\n",
289+
"1.\n",
290+
"Scene 1 (00:00 - 00:00.500000): A serene environment with a beautiful house surrounded by nature.\n",
291+
"2.\n",
292+
"Scene 2 (00:00.500000 - 00:03): A closer look at the house's elegant architecture and interior design.\n",
293+
"3.\n",
294+
"Scene 3 (00:03 - 00:06): Introduction of the \"Hello Copilot\" feature on a gradient background.\n",
295+
"4.\n",
296+
"Scene 4 (00:06 - 00:09): The \"Inspire me\" feature introduced, suggesting creativity aids.\n",
297+
"5.\n",
298+
"Scene 5 (00:09 - 00:12): A desktop view showing a picturesque landscape as the wallpaper.\n",
299+
"6.\n",
300+
"Scene 6 (00:12 - 00:15): Copilot's interface is shown with colorful and interactive designs.\n",
301+
"7.\n",
302+
"Scene 7 (00:15 - 00:18.500000): A transition with a gradient background.\n",
303+
"8.\n",
304+
"Scene 8 (00:18.500000 - 00:22): A closer look at the texture and materials of the Copilot device.\n",
305+
"9.\n",
306+
"Scene 9 (00:22 - 00:23): \"Research a topic\" feature is introduced on a gradient background.\n",
307+
"10.\n",
308+
"Scene 10 (00:23 - 00:26): A webpage on sustainable design appears, demonstrating the research feature.\n",
309+
"11.\n",
310+
"Scene 11 (00:26 - 00:29): A blurred transition scene.\n",
311+
"12.\n",
312+
"Scene 12 (00:29 - 00:33): The interface of an email or messaging app is displayed.\n",
313+
"13.\n",
314+
"Scene 13 (00:33 - 00:36): \"Organize my\" feature introduced on a gradient background.\n",
315+
"14.\n",
316+
"Scene 14 (00:36 - 00:39): \"Help me relax\" feature is displayed in a room with dim lights, indicating a relaxation or sleep mode.\n",
317+
"15.\n",
318+
"Scene 15 (00:39 - 00:43): A scene showing a modern living room, indicating the integration of Copilot in everyday living spaces.\n",
319+
"16.\n",
320+
"Scene 16 (00:43 - 00:46.13): Copilot logo appears, indicating it as an everyday AI companion.\n"
321+
]
322+
}
323+
],
324+
"source": [
325+
"def download_video(sas_url, local_file_path):\n",
326+
" try:\n",
327+
" response = requests.get(sas_url, stream=True)\n",
328+
" if response.status_code == 200:\n",
329+
" with open(local_file_path, 'wb') as file:\n",
330+
" for chunk in response.iter_content(chunk_size=8192):\n",
331+
" file.write(chunk)\n",
332+
" return True\n",
333+
" else:\n",
334+
" print(f\"Download failed with status code: {response.status_code}\")\n",
335+
" return False\n",
336+
" except Exception as e:\n",
337+
" print(f\"An error occurred during download: {e}\")\n",
338+
" return False\n",
339+
"\n",
340+
"def get_video_length(file_path):\n",
341+
" try:\n",
342+
" with VideoFileClip(file_path) as video:\n",
343+
" return video.duration\n",
344+
" except Exception as e:\n",
345+
" print(f\"Error in getting video length: {e}\")\n",
346+
" return None\n",
347+
"\n",
348+
"# Define the number of seconds for each segment\n",
349+
"chunk_size = 20\n",
350+
"# Download the video\n",
351+
"local_file_path = \"downloaded_video.mp4\"\n",
352+
"if download_video(video_SAS_url, local_file_path):\n",
353+
" video_length = get_video_length(local_file_path)\n",
354+
" os.remove(local_file_path)\n",
355+
"\n",
356+
" if video_length is not None:\n",
357+
" print(f\"Video Length: {video_length} seconds\")\n",
358+
" sys_message = f\"\"\"\n",
359+
" The total length of the video is {video_length}s. Your task is to assist in finding all scenes in this video.\n",
360+
" You also need to describe each scene with start and end time. \n",
361+
" \"\"\"\n",
362+
" number_of_segments = int(video_length // chunk_size)\n",
363+
" updated_response = \"\"\n",
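364+
" for i in range(number_of_segments + 1): # +1 adds the trailing segment after the last full chunk; it is zero-length when video_length is an exact multiple of chunk_size\n",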
365+
" start_time = i * chunk_size\n",
366+
" end_time = min((i + 1) * chunk_size, video_length)\n",
367+
" user_prompt = f\"How many scenes from {start_time}s to {end_time}s?\"\n",
368+
" print(f\"Segment {i+1}: {user_prompt}\")\n",
369+
" if i > 0:\n",
370+
" user_prompt += f\"\"\" And here are scenes in the previous segments: {updated_response}. \n",
371+
" You need to combine the scenes in the previous segments with the scenes in this segment and provide a summary.\n",
372+
" \"\"\"\n",
373+
" \n",
374+
" response = call_GPT4V(vision_api_endpoint, vision_api_key, video_index_name, video_id, user_prompt, sys_message)\n",
375+
" updated_response = response['choices'][0]['message']['content']\n",
376+
" #print(f\"Response for segment {i+1}: {updated_response}\")\n",
377+
" time.sleep(2) # Avoid throttling\n",
378+
" \n",
379+
" # Print the final response\n",
380+
" sentences = re.split(r'(?<!\\w\\.\\w.)(?<![A-Z][a-z]\\.)(?<=\\.|\\?)\\s', updated_response)\n",
381+
" for sentence in sentences: # Print the content of the response\n",
382+
" print(sentence)\n",
383+
" else:\n",
384+
" print(\"Failed to process video length.\")\n",
385+
"else:\n",
386+
" print(\"Failed to download video.\")"
387+
]
236388
}
237389
],
238390
"metadata": {

0 commit comments

Comments
 (0)