Skip to content

Commit d171b39

Browse files
authored
Refactor EurostatData to improve dataset handling and add theme selection functionality (#1005)
* Refactor EurostatData to improve dataset handling and add theme selection functionality
* Show download progress
* Refactor EurostatData to streamline theme and subtheme selection, enhancing dataset handling and user interaction
* Add initial metadata processing scripts and configuration files
* Fix test
* Optimize JSONStreamParser to improve performance by adjusting the frequency of yielding control during parsing based on time and count thresholds
* Improve streaming and progress bar
1 parent 0c837d8 commit d171b39

File tree

9 files changed

+584
-71
lines changed

9 files changed

+584
-71
lines changed

js/data/eurostat.js

Lines changed: 393 additions & 60 deletions
Large diffs are not rendered by default.

js/data/meta/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
node_modules

js/data/meta/catalogue.json.gz

115 KB
Binary file not shown.

js/data/meta/metabase.json.gz

1.06 MB
Binary file not shown.

js/data/meta/package-lock.json

Lines changed: 44 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

js/data/meta/package.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"type": "module",
3+
"devDependencies": {
4+
"fast-xml-parser": "^5.2.1"
5+
}
6+
}
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
import fs from 'fs/promises'
2+
import path from 'path'
3+
import { XMLParser } from 'fast-xml-parser'
4+
5+
const basePath = import.meta.dirname
6+
7+
/**
 * Fetch a resource with retries and log download progress to the console.
 *
 * Network failures and retryable HTTP statuses (408, 429, 5xx) are retried
 * with exponential backoff (10 ms doubling up to 5 s, at most 100 attempts).
 * Any other 4xx status is treated as permanent and ends the attempt loop
 * immediately, because repeating the same request cannot succeed.
 *
 * While the body is consumed, progress is printed at most once per second:
 * "<loaded> / <total> bytes" when the server sent a content-length header,
 * otherwise the loaded size scaled to B / KiB / MiB / GiB.
 *
 * @param {RequestInfo | URL} input Resource to fetch (forwarded to `fetch`).
 * @param {RequestInit} [init] Options forwarded to `fetch`.
 * @returns {Promise<Response | null>} A new Response wrapping the progress-reporting
 *   body stream, or `null` when the resource could not be fetched.
 */
const fetchProgress = async (input, init) => {
	const maxAttempts = 100
	const maxWaitSeconds = 5
	const units = ['B', 'KiB', 'MiB', 'GiB']
	let waitTime = 0.01

	for (let attempt = 0; attempt < maxAttempts; attempt++) {
		try {
			const response = await fetch(input, init)
			if (response.status >= 400) {
				// Release the connection; the error body is not the payload we want.
				await response.body?.cancel()
				const retryable = response.status === 408 || response.status === 429 || response.status >= 500
				if (!retryable) {
					console.warn(`Fetch failed with HTTP ${response.status}; not retrying`)
					return null
				}
				throw new Error(`HTTP ${response.status} ${response.statusText}`)
			}

			const total = response.headers.get('content-length')
			let loaded = 0
			let prevPrintTime = 0

			const progressStream = new TransformStream({
				transform(chunk, controller) {
					// Pass the chunk through untouched; only observe its size.
					controller.enqueue(chunk)
					loaded += chunk.byteLength
					const now = Date.now()
					if (now - prevPrintTime <= 1000) {
						return
					}
					prevPrintTime = now
					if (total) {
						console.info(`${loaded} / ${total} bytes`)
						return
					}
					let uidx = 0
					let val = loaded
					while (val > 1000 && uidx + 1 < units.length) {
						val /= 1024
						uidx++
					}
					console.info(`${Math.floor(val * 100) / 100} ${units[uidx]}`)
				},
			})

			return new Response(response.body.pipeThrough(progressStream))
		} catch (err) {
			console.warn(`Fetch attempt ${attempt + 1} failed: ${err?.message ?? err}`)
			if (attempt + 1 < maxAttempts) {
				await new Promise(resolve => setTimeout(resolve, waitTime * 1000))
				waitTime = Math.min(waitTime * 2, maxWaitSeconds)
			}
		}
	}
	return null
}
47+
48+
/**
 * Download the Eurostat table-of-contents XML, convert it to a nested
 * `{ title, children }` tree and write it gzip-compressed to
 * `catalogue.json.gz` next to this script.
 *
 * Branch nodes become `{ title, children: [...] }`; leaf nodes become
 * `{ title, code }`.
 *
 * @returns {Promise<void>} Resolves once the file has been written.
 * @throws {Error} When the catalogue cannot be fetched.
 */
const getCatalogue = async () => {
	console.log('Fetching catalogue...')
	const res = await fetchProgress('https://ec.europa.eu/eurostat/api/dissemination/catalogue/toc/xml')
	if (!res) {
		throw new Error('Failed to fetch catalogue')
	}
	const catalogues = await res.text()
	const parser = new XMLParser({ ignoreAttributes: false })
	// NOTE(review): the second argument looks like a DOMParser-style MIME type;
	// fast-xml-parser appears to treat a truthy value here as a validation option — confirm.
	const doc = parser.parse(catalogues, 'application/xml')
	const themes = doc['nt:tree']['nt:branch'][0]

	// The parser yields a single object for one child and an array for several;
	// a missing child is normalized to an empty list.
	const toArray = value => {
		if (value == null) {
			return []
		}
		return Array.isArray(value) ? value : [value]
	}

	const getLeafs = node => {
		const obj = {
			title: node['nt:title'][0]['#text'],
			children: [],
		}
		const children = node['nt:children']
		const branch = children?.['nt:branch']
		if (branch) {
			for (const b of toArray(branch)) {
				obj.children.push(getLeafs(b))
			}
		} else {
			for (const leaf of toArray(children?.['nt:leaf'])) {
				obj.children.push({
					title: leaf['nt:title'][0]['#text'],
					code: leaf['nt:code'],
				})
			}
		}
		return obj
	}

	const root = getLeafs(themes)

	const readableStream = new Blob([JSON.stringify(root)]).stream()
	const compressedStream = readableStream.pipeThrough(new CompressionStream('gzip'))
	const arrayBuffer = await new Response(compressedStream).arrayBuffer()
	// Await the write so the caller only continues after the file exists and
	// write errors reject this function instead of surfacing as unhandled rejections.
	await fs.writeFile(path.join(basePath, 'catalogue.json.gz'), Buffer.from(arrayBuffer))
}
88+
89+
/**
 * Download the gzip-compressed Eurostat metabase text file, group its rows
 * and write the result gzip-compressed to `metabase.json.gz` next to this script.
 *
 * Each non-blank line is split on tabs into three columns, read here as
 * `id`, `unit` and `value`; the output has the shape
 * `{ [id]: { [unit]: value[] } }`.
 *
 * @returns {Promise<void>} Resolves once the file has been written.
 * @throws {Error} When the metabase cannot be fetched.
 */
const getMetabase = async () => {
	console.log('Fetching metabase...')
	const res = await fetchProgress('https://ec.europa.eu/eurostat/api/dissemination/catalogue/metabase.txt.gz')
	if (!res) {
		throw new Error('Failed to fetch metadata')
	}
	const decompressedStream = res.body.pipeThrough(new DecompressionStream('gzip'))
	const text = await new Response(decompressedStream).text()

	// Null-prototype dictionaries: keys come from downloaded text, so names such as
	// "__proto__" or "constructor" must not resolve to inherited members.
	const data = Object.create(null)
	for (const line of text.split('\n')) {
		if (line.trim().length === 0) {
			continue
		}
		const [id, unit, value] = line.split('\t')
		data[id] ??= Object.create(null)
		data[id][unit] ??= []
		data[id][unit].push(value)
	}

	const readableStream = new Blob([JSON.stringify(data)]).stream()
	const compressedStream = readableStream.pipeThrough(new CompressionStream('gzip'))
	const arrayBuffer = await new Response(compressedStream).arrayBuffer()
	// Await the write so completion and failures are visible to the caller.
	await fs.writeFile(path.join(basePath, 'metabase.json.gz'), Buffer.from(arrayBuffer))
}
118+
119+
await getMetabase()
120+
await getCatalogue()

js/data/util/ioselector.js

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,13 @@ export default class IOSelector {
133133
}
134134
}
135135

136+
clear() {
137+
this._r.replaceChildren()
138+
this._columns = []
139+
this._object = []
140+
this._target = -1
141+
}
142+
136143
terminate() {
137144
this._r.remove()
138145
}

tests/gui/data/eurostat.test.js

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,28 +6,30 @@ describe('classification', () => {
66
let page
77
beforeEach(async () => {
88
page = await getPage()
9+
const dataSelectBox = page.locator('#ml_selector dl:first-child dd:nth-child(2) select')
10+
await dataSelectBox.selectOption('eurostat')
911
})
1012

1113
afterEach(async () => {
1214
await page?.close()
1315
})
1416

1517
test('initialize', async () => {
16-
const dataSelectBox = await page.waitForSelector('#ml_selector dl:first-child dd:nth-child(2) select')
17-
await dataSelectBox.selectOption('eurostat')
18+
const dataMenu = page.locator('#ml_selector #data_menu')
19+
const themeList = dataMenu.locator('div:first-child')
1820

19-
const dataMenu = await page.waitForSelector('#ml_selector #data_menu')
20-
const nameTextBox = await dataMenu.waitForSelector('select[name=name]')
21-
const name = await (await nameTextBox.getProperty('value')).jsonValue()
22-
expect(name).toBe('Population and employment')
23-
24-
const svg = await page.waitForSelector('#plot-area svg')
25-
await svg.waitForSelector('.points .datas circle')
26-
const size = (await svg.$$('.points .datas circle')).length
21+
const svg = page.locator('#plot-area svg')
22+
await svg.locator('.points .datas circle').first().waitFor()
23+
const size = await svg.locator('.points .datas circle').count()
2724
expect(size).toBeGreaterThan(0)
2825

29-
const aiManager = await getaimanager(page)
26+
const aiManager = await getaimanager(page, {
27+
ignoreProperties: ['_catalogue', '_metabase'],
28+
})
3029
expect(aiManager._datas).toBeDefined()
3130
expect(aiManager._datas._x.length).toBe(size)
31+
32+
const nameTextBox = themeList.locator('select').last()
33+
await expect(nameTextBox.inputValue()).resolves.toBe('nama_10_pe')
3234
})
3335
})

0 commit comments

Comments
 (0)