Skip to content

Commit bfa9d4a

Browse files
committed
Add initial metadata processing scripts and configuration files
1 parent 42e3072 commit bfa9d4a

File tree

6 files changed

+200
-0
lines changed

6 files changed

+200
-0
lines changed

js/data/meta/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
node_modules

js/data/meta/catalogue.json.gz

115 KB
Binary file not shown.

js/data/meta/metabase.json.gz

1.06 MB
Binary file not shown.

js/data/meta/package-lock.json

Lines changed: 44 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

js/data/meta/package.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"type": "module",
3+
"devDependencies": {
4+
"fast-xml-parser": "^5.2.1"
5+
}
6+
}
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
import fs from 'fs/promises'
2+
import path from 'path'
3+
import { XMLParser } from 'fast-xml-parser'
4+
5+
// Directory containing this script; the .json.gz output files are written next to it.
const basePath = import.meta.dirname
6+
7+
/**
 * Wraps fetch() so that download progress is logged to the console at most
 * once per second while the response body streams through.
 *
 * @param {RequestInfo | URL} input - Forwarded unchanged to fetch().
 * @param {RequestInit} [init] - Forwarded unchanged to fetch().
 * @returns {Promise<Response>} A Response whose body mirrors the fetched body.
 * @throws {Error} If the server answers with a non-2xx status.
 */
const fetchProgress = async (input, init) => {
  const response = await fetch(input, init)
  // fetch() resolves even for HTTP error statuses; throw so the callers'
  // retry loops see a failure instead of parsing an HTML error page.
  if (!response.ok) {
    throw new Error('HTTP ' + response.status + ' fetching ' + input)
  }
  let loaded = 0
  const total = response.headers.get('content-length')
  let prevPrintTime = 0
  const units = ['B', 'KiB', 'MiB', 'GiB']

  return new Response(
    new ReadableStream({
      // start() runs exactly once and forwards the entire body; the original
      // used pull() but drained everything on the first call anyway.
      start: async controller => {
        for await (const chunk of response.body) {
          controller.enqueue(chunk)
          loaded += chunk.byteLength
          const now = Date.now()
          // Throttle progress output to once per second.
          if (now - prevPrintTime > 1000) {
            prevPrintTime = now
            if (total) {
              console.info(loaded + ' / ' + total + ' bytes')
            } else {
              // No content-length header: log a human-readable byte count.
              // Fix: the original compared against 1000 while dividing by
              // 1024, mislabeling values in the 1000..1023 range.
              let uidx = 0
              let val = loaded
              while (val >= 1024 && uidx + 1 < units.length) {
                val /= 1024
                uidx++
              }
              console.info(Math.floor(val * 100) / 100 + ' ' + units[uidx])
            }
          }
        }
        controller.close()
      },
    })
  )
}
41+
42+
/**
 * Downloads the Eurostat table-of-contents XML, converts the nt:branch /
 * nt:leaf tree into plain {title, children[, code]} objects, and writes the
 * result as gzipped JSON to catalogue.json.gz next to this script.
 *
 * Retries the download with exponential backoff (0.5 s up to 60 s, at most
 * 100 attempts) before giving up.
 *
 * @throws {Error} If the catalogue cannot be fetched after all retries.
 */
const getCatalogue = async () => {
  console.log('Fetching catalogue...')
  let res = null
  // Backoff in milliseconds -- setTimeout takes ms; the original passed
  // 0.5..60, i.e. sub-millisecond "backoff".
  let waitTime = 500
  for (let i = 0; i < 100; i++) {
    try {
      res = await fetchProgress('https://ec.europa.eu/eurostat/api/dissemination/catalogue/toc/xml')
      break
    } catch (error) {
      // Don't swallow the failure silently; log and back off.
      console.warn('Catalogue fetch attempt ' + (i + 1) + ' failed: ' + error.message)
      await new Promise(resolve => setTimeout(resolve, waitTime))
      waitTime = Math.min(waitTime * 2, 60000)
    }
  }
  if (!res) {
    throw new Error('Failed to fetch catalogue')
  }
  const catalogues = await res.text()
  const parser = new XMLParser({
    ignoreAttributes: false,
  })
  // XMLParser.parse takes (xmlData, validationOptions); the original passed a
  // MIME-type string as the second argument, which is not a valid option.
  const doc = parser.parse(catalogues)
  const themes = doc['nt:tree']['nt:branch'][0]

  const root = {}
  // Recursively copy the parsed tree into plain objects. fast-xml-parser
  // yields an array only when an element repeats, hence the Array.isArray
  // branches for both nt:branch and nt:leaf.
  // NOTE(review): assumes nt:title parses to an array whose first entry has
  // '#text' -- confirm against the live feed.
  const getLeafs = (node, depth, obj) => {
    obj.title = node['nt:title'][0]['#text']
    obj.children = []
    const branch = node['nt:children']['nt:branch']
    if (branch) {
      if (Array.isArray(branch)) {
        for (const b of branch) {
          const c = {}
          obj.children.push(c)
          getLeafs(b, depth + 1, c)
        }
      } else {
        const c = {}
        obj.children.push(c)
        getLeafs(branch, depth + 1, c)
      }
    } else {
      const leafs = node['nt:children']['nt:leaf']
      if (Array.isArray(leafs)) {
        for (const leaf of leafs) {
          obj.children.push({
            title: leaf['nt:title'][0]['#text'],
            code: leaf['nt:code'],
          })
        }
      } else {
        obj.children.push({
          title: leafs['nt:title'][0]['#text'],
          code: leafs['nt:code'],
        })
      }
    }
  }

  getLeafs(themes, 0, root)

  // gzip the JSON via the web-streams CompressionStream and persist it.
  const readableStream = new Blob([JSON.stringify(root)]).stream()
  const compressedStream = readableStream.pipeThrough(new CompressionStream('gzip'))
  const arrayBuffer = await new Response(compressedStream).arrayBuffer()
  // Await the write: the original left the promise floating, so write errors
  // were dropped and the process could exit before the file was flushed.
  await fs.writeFile(path.join(basePath, 'catalogue.json.gz'), Buffer.from(arrayBuffer))
}
107+
108+
/**
 * Downloads the gzipped Eurostat metabase (tab-separated id/unit/value
 * lines), groups the values as { [id]: { [unit]: value[] } }, and writes the
 * result as gzipped JSON to metabase.json.gz next to this script.
 *
 * Retries the download with exponential backoff (0.5 s up to 60 s, at most
 * 100 attempts) before giving up.
 *
 * @throws {Error} If the metabase cannot be fetched after all retries.
 */
const getMetabase = async () => {
  console.log('Fetching metabase...')
  let res = null
  // Backoff in milliseconds -- setTimeout takes ms; the original passed
  // 0.5..60, i.e. sub-millisecond "backoff".
  let waitTime = 500
  for (let i = 0; i < 100; i++) {
    try {
      res = await fetchProgress('https://ec.europa.eu/eurostat/api/dissemination/catalogue/metabase.txt.gz')
      break
    } catch (error) {
      // Don't swallow the failure silently; log and back off.
      console.warn('Metabase fetch attempt ' + (i + 1) + ' failed: ' + error.message)
      await new Promise(resolve => setTimeout(resolve, waitTime))
      waitTime = Math.min(waitTime * 2, 60000)
    }
  }
  if (!res) {
    throw new Error('Failed to fetch metadata')
  }
  // The payload is gzip-compressed; decompress it in-stream.
  const decompressedStream = res.body.pipeThrough(new DecompressionStream('gzip'))
  const text = await new Response(decompressedStream).text()

  // Each non-empty line is "id<TAB>unit<TAB>value"; group values per id/unit.
  const data = {}
  for (const line of text.split('\n')) {
    if (line.trim() === '') {
      continue
    }
    const [id, unit, value] = line.split('\t')
    if (!data[id]) {
      data[id] = {}
    }
    if (!data[id][unit]) {
      data[id][unit] = []
    }
    data[id][unit].push(value)
  }

  // gzip the JSON via the web-streams CompressionStream and persist it.
  const readableStream = new Blob([JSON.stringify(data)]).stream()
  const compressedStream = readableStream.pipeThrough(new CompressionStream('gzip'))
  const arrayBuffer = await new Response(compressedStream).arrayBuffer()
  // Await the write: the original left the promise floating, so write errors
  // were dropped and the process could exit before the file was flushed.
  await fs.writeFile(path.join(basePath, 'metabase.json.gz'), Buffer.from(arrayBuffer))
}
147+
148+
await getMetabase()
149+
await getCatalogue()

0 commit comments

Comments
 (0)