Merge branch 'master' into commitByDefault

Evan Fagerberg · web-flow · commit f8080eceded9 · 2018-09-03T16:18:45.000-04:00
diff --git a/.github/stale.yml b/.github/stale.yml
@@ -0,0 +1,17 @@
+# Number of days of inactivity before an issue becomes stale
+daysUntilStale: 90
+# Number of days of inactivity before a stale issue is closed
+daysUntilClose: 30
+# Issues with these labels will never be considered stale
+exemptLabels:
+  - pinned
+  - security
+# Label to use when marking an issue as stale
+staleLabel: stale
+# Comment to post when marking an issue as stale. Set to `false` to disable
+markComment: >
+  This issue has been automatically marked as stale because it has not had
+  recent activity. It will be closed if no further activity occurs. Thank you
+  for your contributions.
+# Comment to post when closing a stale issue. Set to `false` to disable
+closeComment: false
diff --git a/pysolr.py b/pysolr.py
@@ -31,6 +31,13 @@
     # Python 2.X
     from urllib import urlencode
 
+try:
+    # Python 3.X
+    from urllib.parse import quote
+except ImportError:
+    # Python 2.X
+    from urllib import quote
+
 try:
     # Python 3.X
     import html.entities as htmlentities
@@ -1032,13 +1039,13 @@ def extract(self, file_obj, extractOnly=True, handler='update/extract', **kwargs
             "wt": "json",
         }
         params.update(kwargs)
-
+        filename = quote(file_obj.name.encode('utf-8'))
         try:
             # We'll provide the file using its true name as Tika may use that
             # as a file type hint:
             resp = self._send_request('post', handler,
                                       body=params,
-                                      files={'file': (file_obj.name, file_obj)})
+                                      files={'file': (filename, file_obj)})
         except (IOError, SolrError) as err:
             self.log.error("Failed to extract document metadata: %s", err,
                            exc_info=True)
@@ -1051,10 +1058,10 @@ def extract(self, file_obj, extractOnly=True, handler='update/extract', **kwargs
                            exc_info=True)
             raise
 
-        data['contents'] = data.pop(file_obj.name, None)
+        data['contents'] = data.pop(filename, None)
         data['metadata'] = metadata = {}
 
-        raw_metadata = data.pop("%s_metadata" % file_obj.name, None)
+        raw_metadata = data.pop("%s_metadata" % filename, None)
 
         if raw_metadata:
             # The raw format is somewhat annoying: it's a flat list of
diff --git a/tests/test_client.py b/tests/test_client.py
@@ -22,6 +22,11 @@
 except ImportError:
     from urllib import unquote_plus
 
+try:
+    from urllib.parse import quote
+except ImportError:
+    from urllib import quote
+
 
 class UtilsTestCase(unittest.TestCase):
     def test_unescape_html(self):
@@ -877,6 +882,47 @@ def test_extract(self):
         # round-trip:
         self.assertEqual(['Test Title ☃☃'], m['title'])
 
+    def test_extract_special_char_in_filename(self):
+        fake_f = StringIO("""
+            <html>
+                <head>
+                    <meta charset="utf-8">
+                    <meta name="haystack-test" content="test 1234">
+                    <title>Test Title ☃&#x2603;</title>
+                </head>
+                    <body>foobar</body>
+            </html>
+        """)
+        fake_f.name = u"test☃.html"  # the snowman makes the file name non-ASCII
+        extracted = self.solr.extract(fake_f)
+        # extract should default to 'update/extract' handler
+        args, kwargs = self.solr._send_request.call_args
+        self.assertTrue(args[1].startswith('update/extract'))
+
+        # extract should support custom handlers
+        with self.assertRaises(SolrError):
+            self.solr.extract(fake_f, handler='fakehandler')
+        args, kwargs = self.solr._send_request.call_args
+        self.assertTrue(args[1].startswith('fakehandler'))
+
+        # Verify documented response structure:
+        self.assertIn('contents', extracted)
+        self.assertIn('metadata', extracted)
+
+        self.assertIn('foobar', extracted['contents'])
+
+        m = extracted['metadata']
+        # stream_name should be the quote()d UTF-8 file name, not the raw one:
+        self.assertEqual([quote(fake_f.name.encode('utf-8'))], m['stream_name'])
+
+        self.assertIn('haystack-test', m, "HTML metadata should have been extracted!")
+        self.assertEqual(['test 1234'], m['haystack-test'])
+
+        # Note the underhanded use of a double snowman to verify both that Tika
+        # correctly decoded entities and that our UTF-8 characters survived the
+        # round-trip:
+        self.assertEqual(['Test Title ☃☃'], m['title'])
+
     def test_full_url(self):
         self.solr.url = 'http://localhost:8983/solr/core0'
         full_url = self.solr._create_full_url(path='/update')