Skip to content

Commit f8080ec

Browse files
author
Evan Fagerberg
authored
Merge branch 'master' into commitByDefault
2 parents d138ec5 + fcbf73e commit f8080ec

File tree

3 files changed

+74
-4
lines changed

3 files changed

+74
-4
lines changed

.github/stale.yml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Number of days of inactivity before an issue becomes stale
2+
daysUntilStale: 90
3+
# Number of days of inactivity before a stale issue is closed
4+
daysUntilClose: 30
5+
# Issues with these labels will never be considered stale
6+
exemptLabels:
7+
- pinned
8+
- security
9+
# Label to use when marking an issue as stale
10+
staleLabel: stale
11+
# Comment to post when marking an issue as stale. Set to `false` to disable
12+
markComment: >
13+
This issue has been automatically marked as stale because it has not had
14+
recent activity. It will be closed if no further activity occurs. Thank you
15+
for your contributions.
16+
# Comment to post when closing a stale issue. Set to `false` to disable
17+
closeComment: false

pysolr.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,13 @@
3131
# Python 2.X
3232
from urllib import urlencode
3333

34+
try:
35+
# Python 3.X
36+
from urllib.parse import quote
37+
except ImportError:
38+
# Python 2.X
39+
from urllib import quote
40+
3441
try:
3542
# Python 3.X
3643
import html.entities as htmlentities
@@ -1032,13 +1039,13 @@ def extract(self, file_obj, extractOnly=True, handler='update/extract', **kwargs
10321039
"wt": "json",
10331040
}
10341041
params.update(kwargs)
1035-
1042+
filename = quote(file_obj.name.encode('utf-8'))
10361043
try:
10371044
# We'll provide the file using its true name as Tika may use that
10381045
# as a file type hint:
10391046
resp = self._send_request('post', handler,
10401047
body=params,
1041-
files={'file': (file_obj.name, file_obj)})
1048+
files={'file': (filename, file_obj)})
10421049
except (IOError, SolrError) as err:
10431050
self.log.error("Failed to extract document metadata: %s", err,
10441051
exc_info=True)
@@ -1051,10 +1058,10 @@ def extract(self, file_obj, extractOnly=True, handler='update/extract', **kwargs
10511058
exc_info=True)
10521059
raise
10531060

1054-
data['contents'] = data.pop(file_obj.name, None)
1061+
data['contents'] = data.pop(filename, None)
10551062
data['metadata'] = metadata = {}
10561063

1057-
raw_metadata = data.pop("%s_metadata" % file_obj.name, None)
1064+
raw_metadata = data.pop("%s_metadata" % filename, None)
10581065

10591066
if raw_metadata:
10601067
# The raw format is somewhat annoying: it's a flat list of

tests/test_client.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@
2222
except ImportError:
2323
from urllib import unquote_plus
2424

25+
try:
26+
from urllib.parse import quote
27+
except ImportError:
28+
from urllib import quote
29+
2530

2631
class UtilsTestCase(unittest.TestCase):
2732
def test_unescape_html(self):
@@ -877,6 +882,47 @@ def test_extract(self):
877882
# round-trip:
878883
self.assertEqual(['Test Title ☃☃'], m['title'])
879884

885+
def test_extract_special_char_in_filename(self):
886+
fake_f = StringIO("""
887+
<html>
888+
<head>
889+
<meta charset="utf-8">
890+
<meta name="haystack-test" content="test 1234">
891+
<title>Test Title ☃&#x2603;</title>
892+
</head>
893+
<body>foobar</body>
894+
</html>
895+
""")
896+
fake_f.name = u"test☃.html"
897+
extracted = self.solr.extract(fake_f)
898+
# extract should default to 'update/extract' handler
899+
args, kwargs = self.solr._send_request.call_args
900+
self.assertTrue(args[1].startswith('update/extract'))
901+
902+
# extract should support custom handlers
903+
with self.assertRaises(SolrError):
904+
self.solr.extract(fake_f, handler='fakehandler')
905+
args, kwargs = self.solr._send_request.call_args
906+
self.assertTrue(args[1].startswith('fakehandler'))
907+
908+
# Verify documented response structure:
909+
self.assertIn('contents', extracted)
910+
self.assertIn('metadata', extracted)
911+
912+
self.assertIn('foobar', extracted['contents'])
913+
914+
m = extracted['metadata']
915+
916+
self.assertEqual([quote(fake_f.name.encode('utf-8'))], m['stream_name'])
917+
918+
self.assertIn('haystack-test', m, "HTML metadata should have been extracted!")
919+
self.assertEqual(['test 1234'], m['haystack-test'])
920+
921+
# Note the underhanded use of a double snowman to verify both that Tika
922+
# correctly decoded entities and that our UTF-8 characters survived the
923+
# round-trip:
924+
self.assertEqual(['Test Title ☃☃'], m['title'])
925+
880926
def test_full_url(self):
881927
self.solr.url = 'http://localhost:8983/solr/core0'
882928
full_url = self.solr._create_full_url(path='/update')

0 commit comments

Comments
 (0)