|
30 | 30 | "EN = spacy.load('en')" |
31 | 31 | ] |
32 | 32 | }, |
| 33 | + { |
| 34 | + "cell_type": "markdown", |
| 35 | + "metadata": {}, |
| 36 | + "source": [ |
| 37 | + "## Download raw Python files" |
| 38 | + ] |
| 39 | + }, |
| 40 | + { |
| 41 | + "cell_type": "code", |
| 42 | + "execution_count": null, |
| 43 | + "metadata": { |
| 44 | + "collapsed": true |
| 45 | + }, |
| 46 | + "outputs": [], |
| 47 | + "source": [] |
| 48 | + }, |
33 | 49 | { |
34 | 50 | "cell_type": "markdown", |
35 | 51 | "metadata": {}, |
|
107 | 123 | " ' '.join(tokenize_code(function)),\n", |
108 | 124 | " ' '.join(tokenize_docstring(docstring.split('\\n\\n')[0]))\n", |
109 | 125 | " ))\n", |
110 | | - " except (SyntaxError, MemoryError, UnicodeEncodeError):\n", |
| 126 | + " except (AssertionError, MemoryError, SyntaxError, UnicodeEncodeError):\n", |
111 | 127 | " pass\n", |
112 | 128 | " return pairs" |
113 | 129 | ] |
|
251 | 267 | "cell_type": "markdown", |
252 | 268 | "metadata": {}, |
253 | 269 | "source": [ |
254 | | - "## Output each set to train/valid/test.function/docstrings/lineage files" |
| 270 | + "## Output each set to train/valid/test.function/docstring/lineage files\n", |
| 271 | + "The original functions are also written to gzip-compressed JSON files. (Raw functions contain `,`, `\\t`, `\\n`, etc., so the JSON format is less error-prone than CSV.)" |
255 | 272 | ] |
256 | 273 | }, |
257 | 274 | { |
|
264 | 281 | "source": [ |
265 | 282 | "def write_to(df, filename):\n", |
266 | 283 | " df.function_tokens.to_csv('{}.function'.format(filename), index=False)\n", |
| 284 | + " df.original_function.to_json('{}_original_function.json.gz'.format(filename), orient='values', compression='gzip')\n", |
267 | 285 | " if filename != 'without_docstrings':\n", |
268 | 286 | " df.docstring_tokens.to_csv('{}.docstring'.format(filename), index=False)\n", |
269 | 287 | " df.url.to_csv('{}.lineage'.format(filename), index=False)" |
|
0 commit comments