Skip to content

Commit 28762e8

Browse files
committed
added regex pattern matching for suffixes
1 parent ed9ad8b commit 28762e8

File tree

1 file changed

+7
-3
lines changed

1 file changed

+7
-3
lines changed

src/widid.ipynb

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -171,20 +171,24 @@
171171
},
172172
{
173173
"cell_type": "code",
174-
"execution_count": 7,
174+
"execution_count": null,
175175
"id": "342d2e07",
176176
"metadata": {
177177
"lines_to_next_cell": 1
178178
},
179179
"outputs": [],
180180
"source": [
181181
"def extract_contexts(texts, target_word, window=10):\n",
182-
" \"\"\"Extract short context windows around target word.\"\"\"\n",
182+
" \"\"\"Extract short context windows around target word and its morphological variations.\"\"\"\n",
183183
" contexts = []\n",
184+
" # Create regex pattern to match target word and any Turkish suffixes\n",
185+
" pattern = re.compile(rf\"\\b{re.escape(target_word.lower())}\\w*\\b\")\n",
186+
" \n",
184187
" for t in texts:\n",
185188
" tokens = re.findall(r\"\\w+\", t.lower())\n",
186189
" for i, tok in enumerate(tokens):\n",
187-
" if tok == target_word.lower():\n",
190+
" # Use regex to match the word and its variations\n",
191+
" if pattern.match(tok):\n",
188192
" start = max(0, i - window)\n",
189193
" end = min(len(tokens), i + window + 1)\n",
190194
" snippet = \" \".join(tokens[start:end])\n",

0 commit comments

Comments
 (0)