Merge branch 'master' of github.com:hamelsmu/code_search

hamelsmu · hamelsmu · commit 5d529819a6d8 · 2018-05-17T23:15:53.000-07:00
diff --git a/notebooks/1 - Preprocess Data.ipynb b/notebooks/1 - Preprocess Data.ipynb
@@ -30,6 +30,22 @@
     "EN = spacy.load('en')"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Download raw python files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -107,7 +123,7 @@
     "                          ' '.join(tokenize_code(function)),\n",
     "                          ' '.join(tokenize_docstring(docstring.split('\\n\\n')[0]))\n",
     "                         ))\n",
-    "    except (SyntaxError, MemoryError, UnicodeEncodeError):\n",
+    "    except (AssertionError, MemoryError, SyntaxError, UnicodeEncodeError):\n",
     "        pass\n",
     "    return pairs"
    ]
@@ -251,7 +267,8 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Output each set to train/valid/test.function/docstrings/lineage files"
+    "## Output each set to train/valid/test.function/docstrings/lineage files\n",
+    "Original functions are also written to compressed JSON files. (Raw functions contain characters such as `,`, `\\t`, and `\\n`, so storing them as JSON is less error-prone than CSV.)"
    ]
   },
   {
@@ -264,6 +281,7 @@
    "source": [
     "def write_to(df, filename):\n",
     "    df.function_tokens.to_csv('{}.function'.format(filename), index=False)\n",
+    "    df.original_function.to_json('{}_original_function.json.gz'.format(filename), orient='values', compression='gzip')\n",
     "    if filename != 'without_docstrings':\n",
     "        df.docstring_tokens.to_csv('{}.docstring'.format(filename), index=False)\n",
     "    df.url.to_csv('{}.lineage'.format(filename), index=False)"