Skip to content

Commit d78167c

Browse files
authored
Merge pull request #101 from joaquingx/add-support-dublin-core-metadata
[MRG+1] Add dublincore metadata
2 parents 7d245b3 + 043a479 commit d78167c

File tree

10 files changed

+417
-8
lines changed

10 files changed

+417
-8
lines changed

README.rst

Lines changed: 91 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ Currently, *extruct* supports:
2424
- `Microformat`_ via `mf2py`_
2525
- `Facebook's Open Graph`_
2626
- (experimental) `RDFa`_ via `rdflib`_
27+
- `Dublin Core Metadata (DC-HTML-2003)`_
2728

2829
.. _W3C's HTML Microdata: http://www.w3.org/TR/microdata/
2930
.. _embedded JSON-LD: http://www.w3.org/TR/json-ld/#embedding-json-ld-in-html-documents
@@ -32,6 +33,7 @@ Currently, *extruct* supports:
3233
.. _Microformat: http://microformats.org/wiki/Main_Page
3334
.. _mf2py: https://github.com/microformats/mf2py
3435
.. _Facebook's Open Graph: http://ogp.me/
36+
.. _Dublin Core Metadata (DC-HTML-2003): https://www.dublincore.org/specifications/dublin-core/dcq-html/2003-11-30/
3537

3638
The microdata algorithm is a revisit of `this Scrapinghub blog post`_ showing how to use EXSLT extensions.
3739

@@ -71,7 +73,17 @@ First fetch the HTML using python-requests and then feed the response body to ``
7173
>>> data = extruct.extract(r.text, base_url=base_url)
7274
>>>
7375
>>> pp.pprint(data)
74-
{ 'json-ld': [ { '@context': 'https://schema.org',
76+
{ 'dublincore': [ { 'elements': [ { 'URI': 'http://purl.org/dc/elements/1.1/description',
77+
'content': 'What is Open Graph Protocol '
78+
'and why you need it? Learn to '
79+
'implement Open Graph Protocol '
80+
'for Facebook on your website. '
81+
'Open Graph Protocol Meta Tags.',
82+
'name': 'description'}],
83+
'namespaces': {},
84+
'terms': []}],
85+
86+
'json-ld': [ { '@context': 'https://schema.org',
7587
'@id': '#organization',
7688
'@type': 'Organization',
7789
'logo': 'https://www.optimizesmart.com/wp-content/uploads/2016/03/optimize-smart-Twitter-logo.jpg',
@@ -163,7 +175,7 @@ First fetch the HTML using python-requests and then feed the response body to ``
163175

164176
Select syntaxes
165177
+++++++++++++++
166-
It is possible to select which syntaxes to extract by passing a list with the desired ones to extract. Valid values: 'microdata', 'json-ld', 'opengraph', 'microformat', 'rdfa'. If no list is passed all syntaxes will be extracted and returned::
178+
It is possible to select which syntaxes to extract by passing a list with the desired ones to extract. Valid values: 'microdata', 'json-ld', 'opengraph', 'microformat', 'rdfa' and 'dublincore'. If no list is passed all syntaxes will be extracted and returned::
167179

168180
>>> r = requests.get('http://www.songkick.com/artists/236156-elysian-fields')
169181
>>> base_url = get_base_url(r.text, r.url)
@@ -207,9 +219,9 @@ It is possible to select which syntaxes to extract by passing a list with the de
207219

208220
Uniform
209221
+++++++
210-
Another option is to uniform the output of microformat, opengraph, microdata and json-ld syntaxes to the following structure: ::
222+
Another option is to make the output of the microformat, opengraph, microdata, dublincore and json-ld syntaxes uniform, using the following structure: ::
211223

212-
{'@context': 'http://example.com',
224+
{'@context': 'http://example.com',
213225
'@type': 'example_type',
214226
/* All other the properties in keys here */
215227
}
@@ -584,6 +596,80 @@ Microformat extraction
584596
}
585597
}]
586598

599+
DublinCore extraction
600+
++++++++++++++++++++++++++++++
601+
::
602+
603+
>>> import pprint
604+
>>> pp = pprint.PrettyPrinter(indent=2)
605+
>>> from extruct.dublincore import DublinCoreExtractor
606+
>>> html = '''<head profile="http://dublincore.org/documents/dcq-html/">
607+
... <title>Expressing Dublin Core in HTML/XHTML meta and link elements</title>
608+
... <link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" />
609+
... <link rel="schema.DCTERMS" href="http://purl.org/dc/terms/" />
610+
...
611+
...
612+
... <meta name="DC.title" lang="en" content="Expressing Dublin Core
613+
... in HTML/XHTML meta and link elements" />
614+
... <meta name="DC.creator" content="Andy Powell, UKOLN, University of Bath" />
615+
... <meta name="DCTERMS.issued" scheme="DCTERMS.W3CDTF" content="2003-11-01" />
616+
... <meta name="DC.identifier" scheme="DCTERMS.URI"
617+
... content="http://dublincore.org/documents/dcq-html/" />
618+
... <link rel="DCTERMS.replaces" hreflang="en"
619+
... href="http://dublincore.org/documents/2000/08/15/dcq-html/" />
620+
... <meta name="DCTERMS.abstract" content="This document describes how
621+
... qualified Dublin Core metadata can be encoded
622+
... in HTML/XHTML &lt;meta&gt; elements" />
623+
... <meta name="DC.format" scheme="DCTERMS.IMT" content="text/html" />
624+
... <meta name="DC.type" scheme="DCTERMS.DCMIType" content="Text" />
625+
... <meta name="DC.Date.modified" content="2001-07-18" />
626+
... <meta name="DCTERMS.modified" content="2001-07-18" />'''
627+
>>> dublinlde = DublinCoreExtractor()
628+
>>> data = dublinlde.extract(html)
629+
>>> pp.pprint(data)
630+
[ { 'elements': [ { 'URI': 'http://purl.org/dc/elements/1.1/title',
631+
'content': 'Expressing Dublin Core\n'
632+
'in HTML/XHTML meta and link elements',
633+
'lang': 'en',
634+
'name': 'DC.title'},
635+
{ 'URI': 'http://purl.org/dc/elements/1.1/creator',
636+
'content': 'Andy Powell, UKOLN, University of Bath',
637+
'name': 'DC.creator'},
638+
{ 'URI': 'http://purl.org/dc/elements/1.1/identifier',
639+
'content': 'http://dublincore.org/documents/dcq-html/',
640+
'name': 'DC.identifier',
641+
'scheme': 'DCTERMS.URI'},
642+
{ 'URI': 'http://purl.org/dc/elements/1.1/format',
643+
'content': 'text/html',
644+
'name': 'DC.format',
645+
'scheme': 'DCTERMS.IMT'},
646+
{ 'URI': 'http://purl.org/dc/elements/1.1/type',
647+
'content': 'Text',
648+
'name': 'DC.type',
649+
'scheme': 'DCTERMS.DCMIType'}],
650+
'namespaces': { 'DC': 'http://purl.org/dc/elements/1.1/',
651+
'DCTERMS': 'http://purl.org/dc/terms/'},
652+
'terms': [ { 'URI': 'http://purl.org/dc/terms/issued',
653+
'content': '2003-11-01',
654+
'name': 'DCTERMS.issued',
655+
'scheme': 'DCTERMS.W3CDTF'},
656+
{ 'URI': 'http://purl.org/dc/terms/abstract',
657+
'content': 'This document describes how\n'
658+
'qualified Dublin Core metadata can be encoded\n'
659+
'in HTML/XHTML <meta> elements',
660+
'name': 'DCTERMS.abstract'},
661+
{ 'URI': 'http://purl.org/dc/terms/modified',
662+
'content': '2001-07-18',
663+
'name': 'DC.Date.modified'},
664+
{ 'URI': 'http://purl.org/dc/terms/modified',
665+
'content': '2001-07-18',
666+
'name': 'DCTERMS.modified'},
667+
{ 'URI': 'http://purl.org/dc/terms/replaces',
668+
'href': 'http://dublincore.org/documents/2000/08/15/dcq-html/',
669+
'hreflang': 'en',
670+
'rel': 'DCTERMS.replaces'}]}]
671+
672+
587673

588674
Command Line Tool
589675
-----------------
@@ -622,7 +708,7 @@ those, you can pass their individual names collected in a list through 'syntaxes
622708
For example, this command extracts only Microdata and JSON-LD metadata from
623709
"http://example.com"::
624710

625-
extruct "http://example.com" --syntaxes microdata json-ld
711+
extruct "http://example.com" --syntaxes microdata json-ld
626712

627713
NB: syntax names passed must correspond to these: microdata, json-ld, rdfa, opengraph, microformat, dublincore
628714

extruct/_extruct.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,12 @@
66
from extruct.w3cmicrodata import MicrodataExtractor
77
from extruct.opengraph import OpenGraphExtractor
88
from extruct.microformat import MicroformatExtractor
9-
from extruct.uniform import _umicrodata_microformat, _uopengraph
9+
from extruct.dublincore import DublinCoreExtractor
10+
from extruct.uniform import _umicrodata_microformat, _uopengraph, _udublincore
1011
from extruct.utils import parse_xmldom_html
1112

1213
logger = logging.getLogger(__name__)
13-
SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa']
14+
SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa', 'dublincore']
1415

1516

1617
def extract(htmlstring,
@@ -96,6 +97,11 @@ def extract(htmlstring,
9697
('rdfa', RDFaExtractor().extract_items,
9798
tree,
9899
))
100+
if 'dublincore' in syntaxes:
101+
processors.append(
102+
('dublincore', DublinCoreExtractor().extract_items,
103+
tree,
104+
))
99105
output = {}
100106
for syntax, extract, document in processors:
101107
try:
@@ -132,10 +138,20 @@ def extract(htmlstring,
132138
output['opengraph'],
133139
None,
134140
))
141+
if 'dublincore' in syntaxes:
142+
uniform_processors.append(
143+
('dublincore',
144+
_udublincore,
145+
output['dublincore'],
146+
None,
147+
))
148+
135149
for syntax, uniform, raw, schema_context in uniform_processors:
136150
try:
137151
if syntax == 'opengraph':
138152
output[syntax] = uniform(raw, with_og_array=with_og_array)
153+
elif syntax == 'dublincore':
154+
output[syntax] = uniform(raw)
139155
else:
140156
output[syntax] = uniform(raw, schema_context)
141157
except Exception as e:

extruct/dublincore.py

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
import re
2+
3+
from extruct.utils import parse_html
4+
from w3lib.html import strip_html5_whitespace
5+
6+
# Lower-cased Dublin Core element name -> element URI.
# Defined according to DCMES (DCM Version 1.1): http://dublincore.org/documents/dces/
# Keys must stay lower-case: lookups use the output of get_lower_attrib().
_DC_ELEMENTS = {
    'contributor': 'http://purl.org/dc/elements/1.1/contributor',
    'coverage': 'http://purl.org/dc/elements/1.1/coverage',
    'creator': 'http://purl.org/dc/elements/1.1/creator',
    'date': 'http://purl.org/dc/elements/1.1/date',
    'description': 'http://purl.org/dc/elements/1.1/description',
    'format': 'http://purl.org/dc/elements/1.1/format',
    'identifier': 'http://purl.org/dc/elements/1.1/identifier',
    'language': 'http://purl.org/dc/elements/1.1/language',
    'publisher': 'http://purl.org/dc/elements/1.1/publisher',
    'relation': 'http://purl.org/dc/elements/1.1/relation',
    'rights': 'http://purl.org/dc/elements/1.1/rights',
    'source': 'http://purl.org/dc/elements/1.1/source',
    'subject': 'http://purl.org/dc/elements/1.1/subject',
    'title': 'http://purl.org/dc/elements/1.1/title',
    'type': 'http://purl.org/dc/elements/1.1/type',
}
23+
24+
# Lower-cased DCMI term name -> term URI.
# Defined according to: http://dublincore.org/documents/2008/01/14/dcmi-terms/
# Keys must stay lower-case: lookups use the output of get_lower_attrib(),
# which lower-cases the attribute value before comparing.
_DC_TERMS = {
    'abstract': 'http://purl.org/dc/terms/abstract',
    'description': 'http://purl.org/dc/terms/description',
    'accessrights': 'http://purl.org/dc/terms/accessRights',
    'rights': 'http://purl.org/dc/terms/rights',
    'rightsstatement': 'http://purl.org/dc/terms/RightsStatement',
    'accrualmethod': 'http://purl.org/dc/terms/accrualMethod',
    'collection': 'http://purl.org/dc/terms/Collection',
    'methodofaccrual': 'http://purl.org/dc/terms/MethodOfAccrual',
    'accrualperiodicity': 'http://purl.org/dc/terms/accrualPeriodicity',
    'frequency': 'http://purl.org/dc/terms/Frequency',
    'accrualpolicy': 'http://purl.org/dc/terms/accrualPolicy',
    'policy': 'http://purl.org/dc/terms/Policy',
    'alternative': 'http://purl.org/dc/terms/alternative',
    'title': 'http://purl.org/dc/terms/title',
    'audience': 'http://purl.org/dc/terms/audience',
    'agentclass': 'http://purl.org/dc/terms/AgentClass',
    'available': 'http://purl.org/dc/terms/available',
    'date': 'http://purl.org/dc/terms/date',
    'bibliographiccitation': 'http://purl.org/dc/terms/bibliographicCitation',
    'identifier': 'http://purl.org/dc/terms/identifier',
    'bibliographicresource': 'http://purl.org/dc/terms/BibliographicResource',
    'conformsto': 'http://purl.org/dc/terms/conformsTo',
    'relation': 'http://purl.org/dc/terms/relation',
    'standard': 'http://purl.org/dc/terms/Standard',
    'contributor': 'http://purl.org/dc/terms/contributor',
    'agent': 'http://purl.org/dc/terms/Agent',
    'coverage': 'http://purl.org/dc/terms/coverage',
    'locationperiodorjurisdiction': 'http://purl.org/dc/terms/LocationPeriodOrJurisdiction',
    'created': 'http://purl.org/dc/terms/created',
    'creator': 'http://purl.org/dc/terms/creator',
    'dateaccepted': 'http://purl.org/dc/terms/dateAccepted',
    'datecopyrighted': 'http://purl.org/dc/terms/dateCopyrighted',
    'datesubmitted': 'http://purl.org/dc/terms/dateSubmitted',
    'educationlevel': 'http://purl.org/dc/terms/educationLevel',
    'extent': 'http://purl.org/dc/terms/extent',
    'format': 'http://purl.org/dc/terms/format',
    'sizeorduration': 'http://purl.org/dc/terms/SizeOrDuration',
    'mediatypeorextent': 'http://purl.org/dc/terms/MediaTypeOrExtent',
    'hasformat': 'http://purl.org/dc/terms/hasFormat',
    'haspart': 'http://purl.org/dc/terms/hasPart',
    'hasversion': 'http://purl.org/dc/terms/hasVersion',
    'instructionalmethod': 'http://purl.org/dc/terms/instructionalMethod',
    'methodofinstruction': 'http://purl.org/dc/terms/MethodOfInstruction',
    'isformatof': 'http://purl.org/dc/terms/isFormatOf',
    'ispartof': 'http://purl.org/dc/terms/isPartOf',
    'isreferencedby': 'http://purl.org/dc/terms/isReferencedBy',
    'isreplacedby': 'http://purl.org/dc/terms/isReplacedBy',
    'isrequiredby': 'http://purl.org/dc/terms/isRequiredBy',
    'issued': 'http://purl.org/dc/terms/issued',
    'isversionof': 'http://purl.org/dc/terms/isVersionOf',
    'language': 'http://purl.org/dc/terms/language',
    'linguisticsystem': 'http://purl.org/dc/terms/LinguisticSystem',
    'license': 'http://purl.org/dc/terms/license',
    'licensedocument': 'http://purl.org/dc/terms/LicenseDocument',
    'mediator': 'http://purl.org/dc/terms/mediator',
    'medium': 'http://purl.org/dc/terms/medium',
    'physicalresource': 'http://purl.org/dc/terms/PhysicalResource',
    'physicalmedium': 'http://purl.org/dc/terms/PhysicalMedium',
    'modified': 'http://purl.org/dc/terms/modified',
    'provenance': 'http://purl.org/dc/terms/provenance',
    'provenancestatement': 'http://purl.org/dc/terms/ProvenanceStatement',
    'publisher': 'http://purl.org/dc/terms/publisher',
    'references': 'http://purl.org/dc/terms/references',
    'replaces': 'http://purl.org/dc/terms/replaces',
    'requires': 'http://purl.org/dc/terms/requires',
    'rightsholder': 'http://purl.org/dc/terms/rightsHolder',
    'source': 'http://purl.org/dc/terms/source',
    'spatial': 'http://purl.org/dc/terms/spatial',
    'location': 'http://purl.org/dc/terms/Location',
    'subject': 'http://purl.org/dc/terms/subject',
    'tableofcontents': 'http://purl.org/dc/terms/tableOfContents',
    'temporal': 'http://purl.org/dc/terms/temporal',
    'periodoftime': 'http://purl.org/dc/terms/PeriodOfTime',
    'type': 'http://purl.org/dc/terms/type',
    'valid': 'http://purl.org/dc/terms/valid',
}
101+
102+
# Namespace URLs accepted from ``<link rel="schema.*" href="...">`` declarations;
# any other href is ignored when collecting namespaces.
_URL_NAMESPACES = [
    'http://purl.org/dc/terms/',
    'http://purl.org/dc/elements/1.1/',
]
103+
104+
105+
def get_lower_attrib(name):
    """Return the lower-cased part of *name* that follows its last ``.``.

    The result is the key used to look a term up in ``_DC_TERMS`` or
    ``_DC_ELEMENTS``, e.g. ``'DC.Date.modified'`` -> ``'modified'``.
    A name without any ``.`` is returned lower-cased as a whole.

    ``str.rpartition`` is used rather than a ``.*\\.`` regex so that values
    containing newlines are handled the same way as single-line ones (the
    regex does not cross line breaks and would stitch fragments together).
    """
    return name.rpartition('.')[2].lower()
108+
109+
110+
class DublinCoreExtractor(object):
    """DublinCore extractor following extruct API.

    Collects ``<meta name=...>`` and ``<link rel=...>`` nodes whose name maps
    to a known Dublin Core element or term, plus the namespaces declared via
    ``<link rel="schema.*">``.
    """

    def extract(self, htmlstring, base_url=None, encoding='UTF-8'):
        """Parse *htmlstring* and return the extracted items as a list."""
        tree = parse_html(htmlstring, encoding=encoding)
        return list(self.extract_items(tree, base_url=base_url))

    def extract_items(self, document, base_url=None):
        """Yield a single dict with ``namespaces``, ``elements`` and ``terms``.

        *document* must support ``xpath()`` and return nodes exposing an
        ``attrib`` mapping. *base_url* is accepted for API compatibility and
        is not used here. The parsed document is left unmodified.
        """
        elements = []
        terms = []

        def populate_results(node, main_attrib):
            # Classify the node as a DC element or DC term according to the
            # value of ``main_attrib``; nodes without it, or with an unknown
            # name, are ignored.
            name = node.attrib.get(main_attrib)
            if name is None:
                return

            lower_name = get_lower_attrib(name)
            if lower_name in _DC_ELEMENTS:
                uri, bucket = _DC_ELEMENTS[lower_name], elements
            elif lower_name in _DC_TERMS:
                uri, bucket = _DC_TERMS[lower_name], terms
            else:
                return

            # Copy the attributes into a plain dict and add the URI to the
            # copy, so the synthetic 'URI' key never leaks into the document
            # tree (which callers may share with other extractors).
            item = dict(node.attrib.items())
            item['URI'] = uri
            bucket.append(item)

        namespaces = {}
        for link in document.xpath('//link[contains(@rel,"schema")]'):
            href = link.attrib.get('href')
            if href is None:
                # A schema link without href declares nothing usable.
                continue
            url = strip_html5_whitespace(href)
            if url in _URL_NAMESPACES:
                namespaces[re.sub(r"schema\.", "", link.attrib['rel'])] = url

        for meta_node in document.xpath('//meta'):
            populate_results(meta_node, 'name')

        for link_node in document.xpath('//link'):
            populate_results(link_node, 'rel')

        yield {'namespaces': namespaces, 'elements': elements, 'terms': terms}

extruct/uniform.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
import copy
12
from six.moves.urllib.parse import urlparse, urljoin
3+
from extruct.dublincore import get_lower_attrib
24

35

46
def _uopengraph(extracted, with_og_array=False):
@@ -42,6 +44,23 @@ def _umicrodata_microformat(extracted, schema_context):
4244
return res
4345

4446

47+
def _udublincore(extracted):
    """Return a uniform copy of the Dublin Core extraction output.

    For every extracted object (the input is deep-copied, never mutated):

    * ``namespaces`` is moved to ``@context`` (``None`` when absent);
    * each entry of ``elements`` that has any attribute value whose last
      dotted component is ``type`` (case-insensitive) and that carries a
      ``content`` value is removed from ``elements`` and its ``content``
      becomes ``@type``. If several entries qualify, the last one wins.

    Type-like entries without ``content`` are left in ``elements`` untouched.
    """
    out = []
    for obj in copy.deepcopy(extracted):
        obj['@context'] = obj.pop('namespaces', None)

        # Build a new list instead of removing from the list being iterated:
        # in-place removal makes the iteration skip the following element.
        remaining = []
        for element in obj['elements']:
            is_type = any(
                get_lower_attrib(value) == 'type'
                for value in element.values()
            )
            if is_type and 'content' in element:
                obj['@type'] = element['content']
            else:
                remaining.append(element)
        obj['elements'] = remaining

        out.append(obj)
    return out
62+
63+
4564
def _flatten(element, schema_context):
4665
if isinstance(element, dict):
4766
element = flatten_dict(element, schema_context, False)

0 commit comments

Comments
 (0)