Skip to content

Commit 4b1b6a8

Browse files
author
Andrey Vykhodtsev
committed
notebook changes
1 parent d764baa commit 4b1b6a8

12 files changed

+1241
-979
lines changed

01-Load-Data-ACogSearch.ipynb

Lines changed: 57 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@
103103
},
104104
{
105105
"cell_type": "code",
106-
"execution_count": 4,
106+
"execution_count": 5,
107107
"metadata": {},
108108
"outputs": [
109109
{
@@ -255,51 +255,51 @@
255255
" }\n",
256256
" ]\n",
257257
" },\n",
258-
" {\n",
259-
" \"@odata.type\": \"#Microsoft.Skills.Text.KeyPhraseExtractionSkill\",\n",
260-
" \"context\": \"/document/pages/*\",\n",
261-
" \"maxKeyPhraseCount\": 2,\n",
262-
" \"defaultLanguageCode\": \"en\",\n",
263-
" \"inputs\": [\n",
264-
" {\n",
265-
" \"name\": \"text\", \n",
266-
" \"source\": \"/document/pages/*\"\n",
267-
" }\n",
268-
" ],\n",
269-
" \"outputs\": [\n",
270-
" {\n",
271-
" \"name\": \"keyPhrases\",\n",
272-
" \"targetName\": \"keyPhrases\"\n",
273-
" }\n",
274-
" ]\n",
275-
" },\n",
276-
" {\n",
277-
" \"@odata.type\": \"#Microsoft.Skills.Text.V3.EntityRecognitionSkill\",\n",
278-
" \"context\": \"/document/pages/*\",\n",
279-
" \"categories\": [\"Person\", \"URL\", \"Email\"],\n",
280-
" \"minimumPrecision\": 0.5, \n",
281-
" \"defaultLanguageCode\": \"en\",\n",
282-
" \"inputs\": [\n",
283-
" {\n",
284-
" \"name\": \"text\", \n",
285-
" \"source\":\"/document/pages/*\"\n",
286-
" }\n",
287-
" ],\n",
288-
" \"outputs\": [\n",
289-
" {\n",
290-
" \"name\": \"persons\", \n",
291-
" \"targetName\": \"persons\"\n",
292-
" },\n",
293-
" {\n",
294-
" \"name\": \"urls\", \n",
295-
" \"targetName\": \"urls\"\n",
296-
" },\n",
297-
" {\n",
298-
" \"name\": \"emails\", \n",
299-
" \"targetName\": \"emails\"\n",
300-
" }\n",
301-
" ]\n",
302-
" }\n",
258+
" # {\n",
259+
" # \"@odata.type\": \"#Microsoft.Skills.Text.KeyPhraseExtractionSkill\",\n",
260+
" # \"context\": \"/document/pages/*\",\n",
261+
" # \"maxKeyPhraseCount\": 2,\n",
262+
" # \"defaultLanguageCode\": \"en\",\n",
263+
" # \"inputs\": [\n",
264+
" # {\n",
265+
" # \"name\": \"text\", \n",
266+
" # \"source\": \"/document/pages/*\"\n",
267+
" # }\n",
268+
" # ],\n",
269+
" # \"outputs\": [\n",
270+
" # {\n",
271+
" # \"name\": \"keyPhrases\",\n",
272+
" # \"targetName\": \"keyPhrases\"\n",
273+
" # }\n",
274+
" # ]\n",
275+
" # },\n",
276+
" # {\n",
277+
" # \"@odata.type\": \"#Microsoft.Skills.Text.V3.EntityRecognitionSkill\",\n",
278+
" # \"context\": \"/document/pages/*\",\n",
279+
" # \"categories\": [\"Person\", \"URL\", \"Email\"],\n",
280+
" # \"minimumPrecision\": 0.5, \n",
281+
" # \"defaultLanguageCode\": \"en\",\n",
282+
" # \"inputs\": [\n",
283+
" # {\n",
284+
" # \"name\": \"text\", \n",
285+
" # \"source\":\"/document/pages/*\"\n",
286+
" # }\n",
287+
" # ],\n",
288+
" # \"outputs\": [\n",
289+
" # {\n",
290+
" # \"name\": \"persons\", \n",
291+
" # \"targetName\": \"persons\"\n",
292+
" # },\n",
293+
" # {\n",
294+
" # \"name\": \"urls\", \n",
295+
" # \"targetName\": \"urls\"\n",
296+
" # },\n",
297+
" # {\n",
298+
" # \"name\": \"emails\", \n",
299+
" # \"targetName\": \"emails\"\n",
300+
" # }\n",
301+
" # ]\n",
302+
" # }\n",
303303
" ],\n",
304304
" \"cognitiveServices\": {\n",
305305
" \"@odata.type\": \"#Microsoft.Azure.Search.CognitiveServicesByKey\",\n",
@@ -548,7 +548,7 @@
548548
},
549549
{
550550
"cell_type": "code",
551-
"execution_count": 15,
551+
"execution_count": 19,
552552
"metadata": {
553553
"tags": []
554554
},
@@ -557,10 +557,7 @@
557557
"name": "stdout",
558558
"output_type": "stream",
559559
"text": [
560-
"200\n",
561-
"Status: inProgress\n",
562-
"Items Processed: 400\n",
563-
"True\n"
560+
"{'status': 'inProgress', 'statusDetail': None, 'errorMessage': None, 'startTime': '2024-01-28T10:33:09.321Z', 'endTime': None, 'itemsProcessed': 4000, 'itemsFailed': 0, 'initialTrackingState': None, 'finalTrackingState': '{\\r\\n \"lastFullEnumerationStartTime\": \"0001-01-01T00:00:00Z\",\\r\\n \"lastAttemptedEnumerationStartTime\": \"2024-01-28T10:33:09.602Z\",\\r\\n \"nameHighWaterMark\": \"https://datasetsgptsmartsearch.blob.core.windows.net/arxivcs/pdf/0501/0501020v1.pdf\"\\r\\n}', 'mode': 'indexingAllDocs', 'errors': [], 'warnings': [{'key': 'localId=0402023v1.pdf&documentKey=https%3a%2f%2fdatasetsgptsmartsearch.blob.core.windows.net%2farxivcs%2fpdf%2f0402%2f0402023v1.pdf', 'name': 'DocumentExtraction.azureblob.0402023v1.pdf', 'message': 'Could not extract content or metadata from your document. ', 'details': \"Document has unsupported content type 'application/x-gtar'. Blob metadata was indexed, but content extraction was skipped.\", 'documentationLink': 'https://go.microsoft.com/fwlink/?linkid=2104227'}, {'key': 'localId=https%3a%2f%2fdatasetsgptsmartsearch.blob.core.windows.net%2farxivcs%2fpdf%2f0402%2f0402023v1.pdf&documentKey=https%3a%2f%2fdatasetsgptsmartsearch.blob.core.windows.net%2farxivcs%2fpdf%2f0402%2f0402023v1.pdf', 'name': 'Enrichment.SplitSkill.#3', 'message': 'Could not execute skill because one or more skill input was invalid.', 'details': \"Required skill input is missing or empty. Name: 'text', Source: '$(/document/merged_text)'.\", 'documentationLink': 'https://go.microsoft.com/fwlink/?linkid=2106385'}], 'metrics': None}\n"
564561
]
565562
}
566563
],
@@ -570,10 +567,11 @@
570567
" r = requests.get(os.environ['AZURE_SEARCH_ENDPOINT'] + \"/indexers/\" + indexer_name +\n",
571568
" \"/status\", headers=headers, params=params)\n",
572569
" # pprint(json.dumps(r.json(), indent=1))\n",
573-
" print(r.status_code)\n",
574-
" print(\"Status:\",r.json().get('lastResult').get('status'))\n",
575-
" print(\"Items Processed:\",r.json().get('lastResult').get('itemsProcessed'))\n",
576-
" print(r.ok)\n",
570+
" #print(r.status_code)\n",
571+
" #print(\"Status:\",r.json().get('lastResult').get('status'))\n",
572+
" #print(\"Items Processed:\",r.json().get('lastResult').get('itemsProcessed'))\n",
573+
" #print(r.ok)\n",
574+
" print(r.json().get('lastResult'))\n",
577575
" \n",
578576
"except Exception as e:\n",
579577
" print(\"Wait a few seconds until the process starts and run this cell again.\")"
@@ -613,7 +611,7 @@
613611
},
614612
{
615613
"cell_type": "code",
616-
"execution_count": 16,
614+
"execution_count": 17,
617615
"metadata": {},
618616
"outputs": [
619617
{
@@ -695,9 +693,9 @@
695693
],
696694
"metadata": {
697695
"kernelspec": {
698-
"display_name": "Python 3.10 - SDK v2",
696+
"display_name": ".venv",
699697
"language": "python",
700-
"name": "python310-sdkv2"
698+
"name": "python3"
701699
},
702700
"language_info": {
703701
"codemirror_mode": {
@@ -709,12 +707,7 @@
709707
"name": "python",
710708
"nbconvert_exporter": "python",
711709
"pygments_lexer": "ipython3",
712-
"version": "3.10.11"
713-
},
714-
"vscode": {
715-
"interpreter": {
716-
"hash": "9ff083f0c83558f9261023d47a77b9b3eb892c62cdbe066d046abcad1a5edb5c"
717-
}
710+
"version": "3.11.7"
718711
}
719712
},
720713
"nbformat": 4,

02-LoadCSVOneToMany-ACogSearch.ipynb

Lines changed: 64 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@
7676
"name": "stdout",
7777
"output_type": "stream",
7878
"text": [
79-
"201\n",
79+
"204\n",
8080
"True\n"
8181
]
8282
}
@@ -123,7 +123,22 @@
123123
"execution_count": 5,
124124
"id": "2fbbbd0d-3015-4601-9ef1-7008ad168167",
125125
"metadata": {},
126-
"outputs": [],
126+
"outputs": [
127+
{
128+
"name": "stderr",
129+
"output_type": "stream",
130+
"text": [
131+
"/var/folders/mf/1n_x1d_51fs2m4_6tj03p9jm0000gn/T/ipykernel_89913/3169803804.py:2: DeprecationWarning: \n",
132+
"Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n",
133+
"(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n",
134+
"but was not found to be installed on your system.\n",
135+
"If this would cause problems for you,\n",
136+
"please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n",
137+
" \n",
138+
" import pandas as pd\n"
139+
]
140+
}
141+
],
127142
"source": [
128143
"#Download the csv files to disk and inspect using pandas\n",
129144
"import pandas as pd\n",
@@ -148,69 +163,69 @@
148163
"text/html": [
149164
"<style type=\"text/css\">\n",
150165
"</style>\n",
151-
"<table id=\"T_87464\">\n",
166+
"<table id=\"T_18016\">\n",
152167
" <thead>\n",
153168
" <tr>\n",
154169
" <th class=\"blank level0\" >&nbsp;</th>\n",
155-
" <th id=\"T_87464_level0_col0\" class=\"col_heading level0 col0\" >cord_uid</th>\n",
156-
" <th id=\"T_87464_level0_col1\" class=\"col_heading level0 col1\" >source_x</th>\n",
157-
" <th id=\"T_87464_level0_col2\" class=\"col_heading level0 col2\" >title</th>\n",
158-
" <th id=\"T_87464_level0_col3\" class=\"col_heading level0 col3\" >abstract</th>\n",
159-
" <th id=\"T_87464_level0_col4\" class=\"col_heading level0 col4\" >authors</th>\n",
160-
" <th id=\"T_87464_level0_col5\" class=\"col_heading level0 col5\" >url</th>\n",
170+
" <th id=\"T_18016_level0_col0\" class=\"col_heading level0 col0\" >cord_uid</th>\n",
171+
" <th id=\"T_18016_level0_col1\" class=\"col_heading level0 col1\" >source_x</th>\n",
172+
" <th id=\"T_18016_level0_col2\" class=\"col_heading level0 col2\" >title</th>\n",
173+
" <th id=\"T_18016_level0_col3\" class=\"col_heading level0 col3\" >abstract</th>\n",
174+
" <th id=\"T_18016_level0_col4\" class=\"col_heading level0 col4\" >authors</th>\n",
175+
" <th id=\"T_18016_level0_col5\" class=\"col_heading level0 col5\" >url</th>\n",
161176
" </tr>\n",
162177
" </thead>\n",
163178
" <tbody>\n",
164179
" <tr>\n",
165-
" <th id=\"T_87464_level0_row0\" class=\"row_heading level0 row0\" >0</th>\n",
166-
" <td id=\"T_87464_row0_col0\" class=\"data row0 col0\" >ug7v899j</td>\n",
167-
" <td id=\"T_87464_row0_col1\" class=\"data row0 col1\" >PMC</td>\n",
168-
" <td id=\"T_87464_row0_col2\" class=\"data row0 col2\" >Clinical features of culture-p...</td>\n",
169-
" <td id=\"T_87464_row0_col3\" class=\"data row0 col3\" >OBJECTIVE: This retrospective ...</td>\n",
170-
" <td id=\"T_87464_row0_col4\" class=\"data row0 col4\" >Madani, Tariq A; Al-Ghamdi, Ai...</td>\n",
171-
" <td id=\"T_87464_row0_col5\" class=\"data row0 col5\" ><a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC35282/\">https://www.ncbi.nlm.nih.gov/pmc/articles/PMC35282/</a></td>\n",
180+
" <th id=\"T_18016_level0_row0\" class=\"row_heading level0 row0\" >0</th>\n",
181+
" <td id=\"T_18016_row0_col0\" class=\"data row0 col0\" >ug7v899j</td>\n",
182+
" <td id=\"T_18016_row0_col1\" class=\"data row0 col1\" >PMC</td>\n",
183+
" <td id=\"T_18016_row0_col2\" class=\"data row0 col2\" >Clinical features of culture-p...</td>\n",
184+
" <td id=\"T_18016_row0_col3\" class=\"data row0 col3\" >OBJECTIVE: This retrospective ...</td>\n",
185+
" <td id=\"T_18016_row0_col4\" class=\"data row0 col4\" >Madani, Tariq A; Al-Ghamdi, Ai...</td>\n",
186+
" <td id=\"T_18016_row0_col5\" class=\"data row0 col5\" ><a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC35282/\">https://www.ncbi.nlm.nih.gov/pmc/articles/PMC35282/</a></td>\n",
172187
" </tr>\n",
173188
" <tr>\n",
174-
" <th id=\"T_87464_level0_row1\" class=\"row_heading level0 row1\" >1</th>\n",
175-
" <td id=\"T_87464_row1_col0\" class=\"data row1 col0\" >02tnwd4m</td>\n",
176-
" <td id=\"T_87464_row1_col1\" class=\"data row1 col1\" >PMC</td>\n",
177-
" <td id=\"T_87464_row1_col2\" class=\"data row1 col2\" >Nitric oxide: a pro-inflammato...</td>\n",
178-
" <td id=\"T_87464_row1_col3\" class=\"data row1 col3\" >Inflammatory diseases of the r...</td>\n",
179-
" <td id=\"T_87464_row1_col4\" class=\"data row1 col4\" >Vliet, Albert van der; Eiseric...</td>\n",
180-
" <td id=\"T_87464_row1_col5\" class=\"data row1 col5\" ><a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59543/\">https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59543/</a></td>\n",
189+
" <th id=\"T_18016_level0_row1\" class=\"row_heading level0 row1\" >1</th>\n",
190+
" <td id=\"T_18016_row1_col0\" class=\"data row1 col0\" >02tnwd4m</td>\n",
191+
" <td id=\"T_18016_row1_col1\" class=\"data row1 col1\" >PMC</td>\n",
192+
" <td id=\"T_18016_row1_col2\" class=\"data row1 col2\" >Nitric oxide: a pro-inflammato...</td>\n",
193+
" <td id=\"T_18016_row1_col3\" class=\"data row1 col3\" >Inflammatory diseases of the r...</td>\n",
194+
" <td id=\"T_18016_row1_col4\" class=\"data row1 col4\" >Vliet, Albert van der; Eiseric...</td>\n",
195+
" <td id=\"T_18016_row1_col5\" class=\"data row1 col5\" ><a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59543/\">https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59543/</a></td>\n",
181196
" </tr>\n",
182197
" <tr>\n",
183-
" <th id=\"T_87464_level0_row2\" class=\"row_heading level0 row2\" >2</th>\n",
184-
" <td id=\"T_87464_row2_col0\" class=\"data row2 col0\" >ejv2xln0</td>\n",
185-
" <td id=\"T_87464_row2_col1\" class=\"data row2 col1\" >PMC</td>\n",
186-
" <td id=\"T_87464_row2_col2\" class=\"data row2 col2\" >Surfactant protein-D and pulmo...</td>\n",
187-
" <td id=\"T_87464_row2_col3\" class=\"data row2 col3\" >Surfactant protein-D (SP-D) pa...</td>\n",
188-
" <td id=\"T_87464_row2_col4\" class=\"data row2 col4\" >Crouch, Erika C...</td>\n",
189-
" <td id=\"T_87464_row2_col5\" class=\"data row2 col5\" ><a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59549/\">https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59549/</a></td>\n",
198+
" <th id=\"T_18016_level0_row2\" class=\"row_heading level0 row2\" >2</th>\n",
199+
" <td id=\"T_18016_row2_col0\" class=\"data row2 col0\" >ejv2xln0</td>\n",
200+
" <td id=\"T_18016_row2_col1\" class=\"data row2 col1\" >PMC</td>\n",
201+
" <td id=\"T_18016_row2_col2\" class=\"data row2 col2\" >Surfactant protein-D and pulmo...</td>\n",
202+
" <td id=\"T_18016_row2_col3\" class=\"data row2 col3\" >Surfactant protein-D (SP-D) pa...</td>\n",
203+
" <td id=\"T_18016_row2_col4\" class=\"data row2 col4\" >Crouch, Erika C...</td>\n",
204+
" <td id=\"T_18016_row2_col5\" class=\"data row2 col5\" ><a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59549/\">https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59549/</a></td>\n",
190205
" </tr>\n",
191206
" <tr>\n",
192-
" <th id=\"T_87464_level0_row3\" class=\"row_heading level0 row3\" >3</th>\n",
193-
" <td id=\"T_87464_row3_col0\" class=\"data row3 col0\" >2b73a28n</td>\n",
194-
" <td id=\"T_87464_row3_col1\" class=\"data row3 col1\" >PMC</td>\n",
195-
" <td id=\"T_87464_row3_col2\" class=\"data row3 col2\" >Role of endothelin-1 in lung d...</td>\n",
196-
" <td id=\"T_87464_row3_col3\" class=\"data row3 col3\" >Endothelin-1 (ET-1) is a 21 am...</td>\n",
197-
" <td id=\"T_87464_row3_col4\" class=\"data row3 col4\" >Fagan, Karen A; McMurtry, Ivan...</td>\n",
198-
" <td id=\"T_87464_row3_col5\" class=\"data row3 col5\" ><a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59574/\">https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59574/</a></td>\n",
207+
" <th id=\"T_18016_level0_row3\" class=\"row_heading level0 row3\" >3</th>\n",
208+
" <td id=\"T_18016_row3_col0\" class=\"data row3 col0\" >2b73a28n</td>\n",
209+
" <td id=\"T_18016_row3_col1\" class=\"data row3 col1\" >PMC</td>\n",
210+
" <td id=\"T_18016_row3_col2\" class=\"data row3 col2\" >Role of endothelin-1 in lung d...</td>\n",
211+
" <td id=\"T_18016_row3_col3\" class=\"data row3 col3\" >Endothelin-1 (ET-1) is a 21 am...</td>\n",
212+
" <td id=\"T_18016_row3_col4\" class=\"data row3 col4\" >Fagan, Karen A; McMurtry, Ivan...</td>\n",
213+
" <td id=\"T_18016_row3_col5\" class=\"data row3 col5\" ><a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59574/\">https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59574/</a></td>\n",
199214
" </tr>\n",
200215
" <tr>\n",
201-
" <th id=\"T_87464_level0_row4\" class=\"row_heading level0 row4\" >4</th>\n",
202-
" <td id=\"T_87464_row4_col0\" class=\"data row4 col0\" >9785vg6d</td>\n",
203-
" <td id=\"T_87464_row4_col1\" class=\"data row4 col1\" >PMC</td>\n",
204-
" <td id=\"T_87464_row4_col2\" class=\"data row4 col2\" >Gene expression in epithelial ...</td>\n",
205-
" <td id=\"T_87464_row4_col3\" class=\"data row4 col3\" >Respiratory syncytial virus (R...</td>\n",
206-
" <td id=\"T_87464_row4_col4\" class=\"data row4 col4\" >Domachowske, Joseph B; Bonvill...</td>\n",
207-
" <td id=\"T_87464_row4_col5\" class=\"data row4 col5\" ><a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59580/\">https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59580/</a></td>\n",
216+
" <th id=\"T_18016_level0_row4\" class=\"row_heading level0 row4\" >4</th>\n",
217+
" <td id=\"T_18016_row4_col0\" class=\"data row4 col0\" >9785vg6d</td>\n",
218+
" <td id=\"T_18016_row4_col1\" class=\"data row4 col1\" >PMC</td>\n",
219+
" <td id=\"T_18016_row4_col2\" class=\"data row4 col2\" >Gene expression in epithelial ...</td>\n",
220+
" <td id=\"T_18016_row4_col3\" class=\"data row4 col3\" >Respiratory syncytial virus (R...</td>\n",
221+
" <td id=\"T_18016_row4_col4\" class=\"data row4 col4\" >Domachowske, Joseph B; Bonvill...</td>\n",
222+
" <td id=\"T_18016_row4_col5\" class=\"data row4 col5\" ><a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59580/\">https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59580/</a></td>\n",
208223
" </tr>\n",
209224
" </tbody>\n",
210225
"</table>\n"
211226
],
212227
"text/plain": [
213-
"<pandas.io.formats.style.Styler at 0x7f36d4016ad0>"
228+
"<pandas.io.formats.style.Styler at 0x10c683b90>"
214229
]
215230
},
216231
"execution_count": 6,
@@ -452,9 +467,7 @@
452467
"output_type": "stream",
453468
"text": [
454469
"200\n",
455-
"Status: inProgress\n",
456-
"Items Processed: 15000\n",
457-
"True\n"
470+
"Wait a few seconds until the process starts and run this cell again.\n"
458471
]
459472
}
460473
],
@@ -582,9 +595,9 @@
582595
],
583596
"metadata": {
584597
"kernelspec": {
585-
"display_name": "Python 3.10 - SDK v2",
598+
"display_name": ".venv",
586599
"language": "python",
587-
"name": "python310-sdkv2"
600+
"name": "python3"
588601
},
589602
"language_info": {
590603
"codemirror_mode": {
@@ -596,7 +609,7 @@
596609
"name": "python",
597610
"nbconvert_exporter": "python",
598611
"pygments_lexer": "ipython3",
599-
"version": "3.10.11"
612+
"version": "3.11.5"
600613
}
601614
},
602615
"nbformat": 4,

0 commit comments

Comments
 (0)