|
22 | 22 | except ImportError: |
23 | 23 | from urllib import unquote_plus |
24 | 24 |
|
| 25 | +try: |
| 26 | + from urllib.parse import quote |
| 27 | +except ImportError: |
| 28 | + from urllib import quote |
| 29 | + |
25 | 30 |
|
26 | 31 | class UtilsTestCase(unittest.TestCase): |
27 | 32 | def test_unescape_html(self): |
@@ -877,6 +882,47 @@ def test_extract(self): |
877 | 882 | # round-trip: |
878 | 883 | self.assertEqual(['Test Title ☃☃'], m['title']) |
879 | 884 |
|
def test_extract_special_char_in_filename(self):
    # Runs self.solr.extract() on an in-memory HTML document whose file
    # name contains a non-ASCII character, then checks the handler used
    # for the request and the structure of the extracted result.
    html_doc = StringIO("""
        <html>
            <head>
                <meta charset="utf-8">
                <meta name="haystack-test" content="test 1234">
                <title>Test Title ☃☃</title>
            </head>
            <body>foobar</body>
        </html>
    """)
    html_doc.name = u"test☃.html"
    response = self.solr.extract(html_doc)

    # Without an explicit handler, the call recorded on _send_request must
    # target 'update/extract' (args[1] is presumably the request path).
    positional, _kwargs = self.solr._send_request.call_args
    request_target = positional[1]
    self.assertTrue(request_target.startswith('update/extract'))

    # A caller-supplied handler must be used instead of the default; this
    # particular one is expected to make extract() raise SolrError.
    with self.assertRaises(SolrError):
        self.solr.extract(html_doc, handler='fakehandler')
    positional, _kwargs = self.solr._send_request.call_args
    request_target = positional[1]
    self.assertTrue(request_target.startswith('fakehandler'))

    # The result of the first extract() call must expose both documented
    # top-level keys.
    for expected_key in ('contents', 'metadata'):
        self.assertIn(expected_key, response)

    self.assertIn('foobar', response['contents'])

    metadata = response['metadata']

    # The stream name is compared against the percent-encoded UTF-8 form
    # of the file name.
    expected_stream_name = quote(html_doc.name.encode('utf-8'))
    self.assertEqual([expected_stream_name], metadata['stream_name'])

    self.assertIn('haystack-test', metadata, "HTML metadata should have been extracted!")
    self.assertEqual(['test 1234'], metadata['haystack-test'])

    # The double snowman in the title checks that non-ASCII text survives
    # the round-trip. The original note also says it verifies that Tika
    # decoded entities.
    # NOTE(review): the fixture as seen here contains only literal
    # snowmen, no HTML entities -- confirm against the real file.
    self.assertEqual(['Test Title ☃☃'], metadata['title'])
| 925 | + |
880 | 926 | def test_full_url(self): |
881 | 927 | self.solr.url = 'http://localhost:8983/solr/core0' |
882 | 928 | full_url = self.solr._create_full_url(path='/update') |
|
0 commit comments