|
| 1 | +import re |
| 2 | + |
| 3 | +from extruct.utils import parse_html |
| 4 | +from w3lib.html import strip_html5_whitespace |
| 5 | + |
| 6 | +_DC_ELEMENTS = { # Defined according DCMES(DCM Version 1.1): http://dublincore.org/documents/dces/ |
| 7 | + 'contributor': 'http://purl.org/dc/elements/1.1/contributor', |
| 8 | + 'coverage': 'http://purl.org/dc/elements/1.1/coverage', |
| 9 | + 'creator': 'http://purl.org/dc/elements/1.1/creator', |
| 10 | + 'date': 'http://purl.org/dc/elements/1.1/date', |
| 11 | + 'description': 'http://purl.org/dc/elements/1.1/description', |
| 12 | + 'format': 'http://purl.org/dc/elements/1.1/format', |
| 13 | + 'identifier': 'http://purl.org/dc/elements/1.1/identifier', |
| 14 | + 'language': 'http://purl.org/dc/elements/1.1/language', |
| 15 | + 'publisher': 'http://purl.org/dc/elements/1.1/publiser', |
| 16 | + 'relation': 'http://purl.org/dc/elements/1.1/relation', |
| 17 | + 'rights': 'http://purl.org/dc/elements/1.1/rights', |
| 18 | + 'source': 'http://purl.org/dc/elements/1.1/source', |
| 19 | + 'subject': 'http://purl.org/dc/elements/1.1/subject', |
| 20 | + 'title': 'http://purl.org/dc/elements/1.1/title', |
| 21 | + 'type': 'http://purl.org/dc/elements/1.1/type', |
| 22 | +} |
| 23 | + |
| 24 | +_DC_TERMS = { # Defined according: http://dublincore.org/documents/2008/01/14/dcmi-terms/ |
| 25 | + 'abstract': 'http://purl.org/dc/terms/abstract', |
| 26 | + 'description': 'http://purl.org/dc/terms/description', |
| 27 | + 'accessrights': 'http://purl.org/dc/terms/accessRights', |
| 28 | + 'rights': 'http://purl.org/dc/terms/rights', |
| 29 | + 'rightsstatement': 'http://purl.org/dc/terms/RightsStatement', |
| 30 | + 'accrualmethod': 'http://purl.org/dc/terms/accrualMethod', |
| 31 | + 'collection': 'http://purl.org/dc/terms/Collection', |
| 32 | + 'methodOfaccrual': 'http://purl.org/dc/terms/MethodOfAccrual', |
| 33 | + 'accrualperiodicity': 'http://purl.org/dc/terms/accrualPeriodicity', |
| 34 | + 'frequency': 'http://purl.org/dc/terms/Frequency', |
| 35 | + 'accrualpolicy': 'http://purl.org/dc/terms/accrualPolicy', |
| 36 | + 'policy': 'http://purl.org/dc/terms/Policy', |
| 37 | + 'alternative': 'http://purl.org/dc/terms/alternative', |
| 38 | + 'title': 'http://purl.org/dc/terms/title', |
| 39 | + 'audience': 'http://purl.org/dc/terms/audience', |
| 40 | + 'agentclass': 'http://purl.org/dc/terms/AgentClass', |
| 41 | + 'available': 'http://purl.org/dc/terms/available', |
| 42 | + 'date': 'http://purl.org/dc/terms/date', |
| 43 | + 'bibliographiccitation': 'http://purl.org/dc/terms/bibliographicCitation', |
| 44 | + 'identifier': 'http://purl.org/dc/terms/identifier', |
| 45 | + 'bibliographicresource': 'http://purl.org/dc/terms/BibliographicResource', |
| 46 | + 'conformsto': 'http://purl.org/dc/terms/conformsTo', |
| 47 | + 'relation': 'http://purl.org/dc/terms/relation', |
| 48 | + 'standard': 'http://purl.org/dc/terms/Standard', |
| 49 | + 'contributor': 'http://purl.org/dc/terms/contributor', |
| 50 | + 'agent': 'http://purl.org/dc/terms/Agent', |
| 51 | + 'coverage': 'http://purl.org/dc/terms/coverage', |
| 52 | + 'locationperiodorjurisdiction': 'http://purl.org/dc/terms/LocationPeriodOrJurisdiction', |
| 53 | + 'created': 'http://purl.org/dc/terms/created', |
| 54 | + 'creator': 'http://purl.org/dc/terms/creator', |
| 55 | + 'dateaccepted': 'http://purl.org/dc/terms/dateAccepted', |
| 56 | + 'datecopyrighted': 'http://purl.org/dc/terms/dateCopyrighted', |
| 57 | + 'datesubmitted': 'http://purl.org/dc/terms/dateSubmitted', |
| 58 | + 'educationlevel': 'http://purl.org/dc/terms/educationLevel', |
| 59 | + 'extent': 'http://purl.org/dc/terms/extent', |
| 60 | + 'format': 'http://purl.org/dc/terms/format', |
| 61 | + 'sizeorduration': 'http://purl.org/dc/terms/SizeOrDuration', |
| 62 | + 'mediatypeorextent': 'http://purl.org/dc/terms/MediaTypeOrExtent', |
| 63 | + 'hasformat': 'http://purl.org/dc/terms/hasFormat', |
| 64 | + 'haspart': 'http://purl.org/dc/terms/hasPart', |
| 65 | + 'hasversion': 'http://purl.org/dc/terms/hasVersion', |
| 66 | + 'instructionalmethod': 'http://purl.org/dc/terms/instructionalMethod', |
| 67 | + 'methodofinstruction': 'http://purl.org/dc/terms/MethodOfInstruction', |
| 68 | + 'isformatof': 'http://purl.org/dc/terms/isFormatOf', |
| 69 | + 'ispartof': 'http://purl.org/dc/terms/isPartOf', |
| 70 | + 'isreferencedby': 'http://purl.org/dc/terms/isReferencedBy', |
| 71 | + 'isreplacedby': 'http://purl.org/dc/terms/isReplacedBy', |
| 72 | + 'isrequiredby': 'http://purl.org/dc/terms/isRequiredBy', |
| 73 | + 'issued': 'http://purl.org/dc/terms/issued', |
| 74 | + 'isversionof': 'http://purl.org/dc/terms/isVersionOf', |
| 75 | + 'language': 'http://purl.org/dc/terms/language', |
| 76 | + 'linguisticsystem': 'http://purl.org/dc/terms/LinguisticSystem', |
| 77 | + 'license': 'http://purl.org/dc/terms/license', |
| 78 | + 'licensedocument': 'http://purl.org/dc/terms/LicenseDocument', |
| 79 | + 'mediator': 'http://purl.org/dc/terms/mediator', |
| 80 | + 'medium': 'http://purl.org/dc/terms/medium', |
| 81 | + 'physicalresource': 'http://purl.org/dc/terms/PhysicalResource', |
| 82 | + 'physicalmedium': 'http://purl.org/dc/terms/PhysicalMedium', |
| 83 | + 'modified': 'http://purl.org/dc/terms/modified', |
| 84 | + 'provenance': 'http://purl.org/dc/terms/provenance', |
| 85 | + 'provenancestatement': 'http://purl.org/dc/terms/ProvenanceStatement', |
| 86 | + 'publisher': 'http://purl.org/dc/terms/publisher', |
| 87 | + 'references': 'http://purl.org/dc/terms/references', |
| 88 | + 'replaces': 'http://purl.org/dc/terms/replaces', |
| 89 | + 'requires': 'http://purl.org/dc/terms/requires', |
| 90 | + 'rightsholder': 'http://purl.org/dc/terms/rightsHolder', |
| 91 | + 'source': 'http://purl.org/dc/terms/source', |
| 92 | + 'spatial': 'http://purl.org/dc/terms/spatial', |
| 93 | + 'location': 'http://purl.org/dc/terms/Location', |
| 94 | + 'subject': 'http://purl.org/dc/terms/subject', |
| 95 | + 'tableofcontents': 'http://purl.org/dc/terms/tableOfContents', |
| 96 | + 'temporal': 'http://purl.org/dc/terms/temporal', |
| 97 | + 'periodoftime': 'http://purl.org/dc/terms/PeriodOfTime', |
| 98 | + 'type': 'http://purl.org/dc/terms/type', |
| 99 | + 'valid': 'http://purl.org/dc/terms/valid', |
| 100 | +} |
| 101 | + |
| 102 | +_URL_NAMESPACES = ['http://purl.org/dc/terms/', 'http://purl.org/dc/elements/1.1/'] |
| 103 | + |
| 104 | + |
def get_lower_attrib(name):
    """Return ``name`` without its dotted prefix, lower-cased.

    Everything up to and including the last ``.`` is removed (for example
    ``"DC.Title"`` becomes ``"title"``) so the result can be compared with
    the keys of ``_DC_TERMS`` or ``_DC_ELEMENTS``. A name without a dot is
    only lower-cased.
    """
    without_prefix = re.sub(r".*\.", "", name)
    return without_prefix.lower()
| 108 | + |
| 109 | + |
class DublinCoreExtractor(object):
    """DublinCore extractor following extruct API."""

    def extract(self, htmlstring, base_url=None, encoding='UTF-8'):
        """Parse ``htmlstring`` and return the extracted items as a list.

        ``htmlstring`` and ``encoding`` are handed to ``parse_html``;
        ``base_url`` is forwarded unchanged to :meth:`extract_items`.
        """
        tree = parse_html(htmlstring, encoding=encoding)
        return list(self.extract_items(tree, base_url=base_url))

    def extract_items(self, document, base_url=None):
        """Yield a single dict with the Dublin Core data in ``document``.

        The yielded dict has three keys:

        * ``'namespaces'``: maps the ``rel`` value of each
          ``<link rel="schema.X" href="...">`` (with ``schema.`` removed)
          to its ``href``, for hrefs listed in ``_URL_NAMESPACES``.
        * ``'elements'``: one dict per ``<meta name=...>`` / ``<link rel=...>``
          whose name matches a key of ``_DC_ELEMENTS``; the dict holds the
          node's attributes plus a ``'URI'`` entry.
        * ``'terms'``: the same, for names matching ``_DC_TERMS``.

        ``document`` must offer ``xpath()`` and yield nodes with an
        ``attrib`` mapping; it is not modified. ``base_url`` is accepted
        for API compatibility and is currently unused.
        """
        elements = []
        terms = []

        def collect(node, main_attrib):
            # File the node under elements or terms, if its name is known.
            name = node.attrib.get(main_attrib)
            if name is None:
                return

            key = get_lower_attrib(name)
            # NOTE: elements are checked first and the prefix is discarded,
            # so a name present in both vocabularies (e.g. "title") is
            # always reported as an element, whatever its prefix was.
            if key in _DC_ELEMENTS:
                uri, bucket = _DC_ELEMENTS[key], elements
            elif key in _DC_TERMS:
                uri, bucket = _DC_TERMS[key], terms
            else:
                return

            # Work on a copy so the 'URI' entry is not written back onto
            # the caller's document.
            entry = dict(node.attrib.items())
            entry['URI'] = uri
            bucket.append(entry)

        namespaces = {}
        for schema_link in document.xpath('//link[contains(@rel,"schema")]'):
            href = schema_link.attrib.get('href')
            if href is None:
                # A schema link without href declares no namespace.
                continue
            url = strip_html5_whitespace(href)
            if url in _URL_NAMESPACES:
                prefix = re.sub(r"schema\.", "", schema_link.attrib['rel'])
                namespaces[prefix] = url

        for meta_node in document.xpath('//meta'):
            collect(meta_node, 'name')

        for link_node in document.xpath('//link'):
            collect(link_node, 'rel')

        yield {'namespaces': namespaces, 'elements': elements, 'terms': terms}
0 commit comments