Skip to content

Commit 99938f8

Browse files
committed
fix: update resource estimation calculation
1 parent 2824471 commit 99938f8

File tree

4 files changed

+71
-55
lines changed

4 files changed

+71
-55
lines changed

src/gguf/insights/GgufInsights.ts

Lines changed: 18 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -382,8 +382,24 @@ export class GgufInsights {
382382
const cpuKVCacheSize = this._estimateKvMemorySizeInBytes(kvSize, finalCpuLayers);
383383

384384
// source: `llama_context::graph_max_nodes` in `llama-context.cpp`
385-
const maxNodes = Math.max(65536, 5 * tensorInfo.length);
386-
const cpuNodes = 5 * (tensorInfo.length * (finalCpuLayers / totalFileLayers));
385+
const getMaxNodesMultiplier = (arch: GgufArchitectureType | undefined, nTokens: number): {min: number, multiplier: number} => {
386+
if (arch === GgufArchitectureType.qwen3next)
387+
return {
388+
min: nTokens * 40,
389+
multiplier: 32
390+
};
391+
392+
return {
393+
min: 1024,
394+
multiplier: 8
395+
};
396+
};
397+
const maxNodesMultiplier = getMaxNodesMultiplier(
398+
this._ggufFileInfo.metadata?.general?.architecture,
399+
Math.min(actualContextSize, batchSize)
400+
);
401+
const maxNodes = Math.max(maxNodesMultiplier.min, maxNodesMultiplier.multiplier * tensorInfo.length);
402+
const cpuNodes = maxNodesMultiplier.multiplier * (tensorInfo.length * (finalCpuLayers / totalFileLayers));
387403
const gpuNodes = maxNodes - cpuNodes;
388404

389405
const gpuComputeBufferSize = (this._llama._consts.ggmlTensorOverhead * gpuNodes) +

test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts

Lines changed: 14 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -255,7 +255,7 @@ describe("functionary", () => {
255255
freeRam: s1GB * 4.5
256256
});
257257
expect(res.gpuLayers).to.eql(16);
258-
expect(res.contextSize).to.toMatchInlineSnapshot("3840");
258+
expect(res.contextSize).to.toMatchInlineSnapshot("4096");
259259
}
260260
try {
261261
await resolveGpuLayers(16, {
@@ -343,7 +343,7 @@ describe("functionary", () => {
343343
unifiedMemorySize: s1GB * 7.3
344344
});
345345
expect(res.gpuLayers).to.eql(16);
346-
expect(res.contextSize).to.toMatchInlineSnapshot("1536");
346+
expect(res.contextSize).to.toMatchInlineSnapshot("1792");
347347
}
348348
{
349349
const res = await resolveGpuLayers(16, {
@@ -820,7 +820,7 @@ describe("functionary", () => {
820820
unifiedMemorySize: s1GB * 6
821821
});
822822
expect(res.gpuLayers).to.eql(33);
823-
expect(res.contextSize).to.toMatchInlineSnapshot("2816");
823+
expect(res.contextSize).to.toMatchInlineSnapshot("3072");
824824
}
825825
{
826826
const res = await resolveGpuLayers(33, {
@@ -908,7 +908,7 @@ describe("functionary", () => {
908908
freeRam: s1GB * 1
909909
});
910910
expect(res.gpuLayers).to.eql(33);
911-
expect(res.contextSize).to.toMatchInlineSnapshot("458");
911+
expect(res.contextSize).to.toMatchInlineSnapshot("501");
912912
}
913913
{
914914
const res = await resolveGpuLayers("max", {
@@ -918,7 +918,7 @@ describe("functionary", () => {
918918
freeRam: s1GB * 1
919919
});
920920
expect(res.gpuLayers).to.eql(33);
921-
expect(res.contextSize).to.toMatchInlineSnapshot("768");
921+
expect(res.contextSize).to.toMatchInlineSnapshot("1024");
922922
}
923923
});
924924

@@ -962,7 +962,7 @@ describe("functionary", () => {
962962
freeRam: s1GB * 8
963963
});
964964
expect(res.gpuLayers).to.toMatchInlineSnapshot("7");
965-
expect(res.contextSize).to.toMatchInlineSnapshot("7424");
965+
expect(res.contextSize).to.toMatchInlineSnapshot("7936");
966966
}
967967
{
968968
const res = await resolveGpuLayers("auto", {
@@ -1125,7 +1125,7 @@ describe("functionary", () => {
11251125
freeRam: s1GB * 5
11261126
});
11271127
expect(res.gpuLayers).to.toMatchInlineSnapshot("7");
1128-
expect(res.contextSize).to.toMatchInlineSnapshot("7424");
1128+
expect(res.contextSize).to.toMatchInlineSnapshot("7936");
11291129
}
11301130
{
11311131
const res = await resolveGpuLayers("auto", {
@@ -1349,7 +1349,7 @@ describe("functionary", () => {
13491349
expect(res.gpuLayers).to.be.gte(16);
13501350
expect(res.gpuLayers).to.be.lte(24);
13511351
expect(res.gpuLayers).to.toMatchInlineSnapshot("16");
1352-
expect(res.contextSize).to.toMatchInlineSnapshot("3840");
1352+
expect(res.contextSize).to.toMatchInlineSnapshot("4096");
13531353
}
13541354
});
13551355

@@ -1451,7 +1451,7 @@ describe("functionary", () => {
14511451
expect(res.gpuLayers).to.be.gte(16);
14521452
expect(res.gpuLayers).to.be.lte(24);
14531453
expect(res.gpuLayers).to.toMatchInlineSnapshot("16");
1454-
expect(res.contextSize).to.toMatchInlineSnapshot("3840");
1454+
expect(res.contextSize).to.toMatchInlineSnapshot("4096");
14551455
}
14561456
});
14571457
});
@@ -1479,8 +1479,8 @@ describe("functionary", () => {
14791479
totalRam: s1GB * 8,
14801480
freeRam: s1GB * 8
14811481
});
1482-
expect(res.gpuLayers).to.toMatchInlineSnapshot("21");
1483-
expect(res.contextSize).to.toMatchInlineSnapshot("6400");
1482+
expect(res.gpuLayers).to.toMatchInlineSnapshot("22");
1483+
expect(res.contextSize).to.toMatchInlineSnapshot("5376");
14841484
expect(res.contextSize).to.be.gte(contextSize);
14851485
}
14861486
{
@@ -1492,7 +1492,7 @@ describe("functionary", () => {
14921492
freeRam: s1GB * 8
14931493
});
14941494
expect(res.gpuLayers).to.toMatchInlineSnapshot("7");
1495-
expect(res.contextSize).to.toMatchInlineSnapshot("7424");
1495+
expect(res.contextSize).to.toMatchInlineSnapshot("7936");
14961496
expect(res.contextSize).to.be.gte(contextSize);
14971497
}
14981498
{
@@ -1569,7 +1569,7 @@ describe("functionary", () => {
15691569
freeRam: s1GB * 7
15701570
});
15711571
expect(res.gpuLayers).to.toMatchInlineSnapshot("21");
1572-
expect(res.contextSize).to.toMatchInlineSnapshot("6400");
1572+
expect(res.contextSize).to.toMatchInlineSnapshot("6656");
15731573
expect(res.contextSize).to.be.gte(contextSize);
15741574
}
15751575
{
@@ -1581,7 +1581,7 @@ describe("functionary", () => {
15811581
freeRam: s1GB * 7
15821582
});
15831583
expect(res.gpuLayers).to.toMatchInlineSnapshot("7");
1584-
expect(res.contextSize).to.toMatchInlineSnapshot("7424");
1584+
expect(res.contextSize).to.toMatchInlineSnapshot("7936");
15851585
expect(res.contextSize).to.be.gte(contextSize);
15861586
}
15871587
{

test/modelDependent/functionary/gguf/ggufInsights.test.ts

Lines changed: 26 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -124,7 +124,7 @@ describe("gguf", async () => {
124124
sequences: context.totalSequences,
125125
modelGpuLayers: ggufInsights.totalLayers
126126
}).gpuVram;
127-
expect(toBytes(estimatedContextVramUsage)).toMatchInlineSnapshot("\"1.03GB\"");
127+
expect(toBytes(estimatedContextVramUsage)).toMatchInlineSnapshot('"1GB"');
128128
expect(Math.abs(contextVramUsageDiff - estimatedContextVramUsage)).to.be.lte(s330MB);
129129

130130
await model.dispose();
@@ -190,7 +190,7 @@ describe("gguf", async () => {
190190
batchSize: 512
191191
}))).toMatchInlineSnapshot(`
192192
{
193-
"cpuRam": "643.08MB",
193+
"cpuRam": "643.45MB",
194194
"gpuVram": "0B",
195195
}
196196
`);
@@ -201,7 +201,7 @@ describe("gguf", async () => {
201201
batchSize: 512
202202
}))).toMatchInlineSnapshot(`
203203
{
204-
"cpuRam": "451.08MB",
204+
"cpuRam": "451.45MB",
205205
"gpuVram": "0B",
206206
}
207207
`);
@@ -214,7 +214,7 @@ describe("gguf", async () => {
214214
}))).toMatchInlineSnapshot(`
215215
{
216216
"cpuRam": "1.71GB",
217-
"gpuVram": "355.75MB",
217+
"gpuVram": "330.78MB",
218218
}
219219
`);
220220
expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({
@@ -224,8 +224,8 @@ describe("gguf", async () => {
224224
batchSize: 512
225225
}))).toMatchInlineSnapshot(`
226226
{
227-
"cpuRam": "1002.82MB",
228-
"gpuVram": "315.75MB",
227+
"cpuRam": "1003.17MB",
228+
"gpuVram": "290.78MB",
229229
}
230230
`);
231231
expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({
@@ -235,8 +235,8 @@ describe("gguf", async () => {
235235
batchSize: 512
236236
}))).toMatchInlineSnapshot(`
237237
{
238-
"cpuRam": "630.82MB",
239-
"gpuVram": "295.75MB",
238+
"cpuRam": "631.17MB",
239+
"gpuVram": "270.78MB",
240240
}
241241
`);
242242
expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({
@@ -246,8 +246,8 @@ describe("gguf", async () => {
246246
batchSize: 512
247247
}))).toMatchInlineSnapshot(`
248248
{
249-
"cpuRam": "444.82MB",
250-
"gpuVram": "285.75MB",
249+
"cpuRam": "445.17MB",
250+
"gpuVram": "260.78MB",
251251
}
252252
`);
253253

@@ -258,8 +258,8 @@ describe("gguf", async () => {
258258
batchSize: 512
259259
}))).toMatchInlineSnapshot(`
260260
{
261-
"cpuRam": "1022.79MB",
262-
"gpuVram": "1.05GB",
261+
"cpuRam": "1022.98MB",
262+
"gpuVram": "1.03GB",
263263
}
264264
`);
265265
expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({
@@ -269,8 +269,8 @@ describe("gguf", async () => {
269269
batchSize: 512
270270
}))).toMatchInlineSnapshot(`
271271
{
272-
"cpuRam": "638.79MB",
273-
"gpuVram": "679.75MB",
272+
"cpuRam": "638.98MB",
273+
"gpuVram": "654.98MB",
274274
}
275275
`);
276276
expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({
@@ -280,8 +280,8 @@ describe("gguf", async () => {
280280
batchSize: 512
281281
}))).toMatchInlineSnapshot(`
282282
{
283-
"cpuRam": "446.79MB",
284-
"gpuVram": "479.75MB",
283+
"cpuRam": "446.98MB",
284+
"gpuVram": "454.98MB",
285285
}
286286
`);
287287
expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({
@@ -291,8 +291,8 @@ describe("gguf", async () => {
291291
batchSize: 512
292292
}))).toMatchInlineSnapshot(`
293293
{
294-
"cpuRam": "350.79MB",
295-
"gpuVram": "379.75MB",
294+
"cpuRam": "350.98MB",
295+
"gpuVram": "354.98MB",
296296
}
297297
`);
298298

@@ -304,7 +304,7 @@ describe("gguf", async () => {
304304
}))).toMatchInlineSnapshot(`
305305
{
306306
"cpuRam": "250.5MB",
307-
"gpuVram": "1.78GB",
307+
"gpuVram": "1.75GB",
308308
}
309309
`);
310310
expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({
@@ -315,7 +315,7 @@ describe("gguf", async () => {
315315
}))).toMatchInlineSnapshot(`
316316
{
317317
"cpuRam": "250.5MB",
318-
"gpuVram": "1.03GB",
318+
"gpuVram": "1GB",
319319
}
320320
`);
321321
expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({
@@ -326,7 +326,7 @@ describe("gguf", async () => {
326326
}))).toMatchInlineSnapshot(`
327327
{
328328
"cpuRam": "250.5MB",
329-
"gpuVram": "668.02MB",
329+
"gpuVram": "643.45MB",
330330
}
331331
`);
332332
expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({
@@ -337,7 +337,7 @@ describe("gguf", async () => {
337337
}))).toMatchInlineSnapshot(`
338338
{
339339
"cpuRam": "250.5MB",
340-
"gpuVram": "476.02MB",
340+
"gpuVram": "451.45MB",
341341
}
342342
`);
343343

@@ -349,7 +349,7 @@ describe("gguf", async () => {
349349
}))).toMatchInlineSnapshot(`
350350
{
351351
"cpuRam": "250.5MB",
352-
"gpuVram": "1.78GB",
352+
"gpuVram": "1.75GB",
353353
}
354354
`);
355355
expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({
@@ -360,7 +360,7 @@ describe("gguf", async () => {
360360
}))).toMatchInlineSnapshot(`
361361
{
362362
"cpuRam": "250.5MB",
363-
"gpuVram": "1.03GB",
363+
"gpuVram": "1GB",
364364
}
365365
`);
366366
expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({
@@ -371,7 +371,7 @@ describe("gguf", async () => {
371371
}))).toMatchInlineSnapshot(`
372372
{
373373
"cpuRam": "250.5MB",
374-
"gpuVram": "668.02MB",
374+
"gpuVram": "643.45MB",
375375
}
376376
`);
377377
expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({
@@ -382,7 +382,7 @@ describe("gguf", async () => {
382382
}))).toMatchInlineSnapshot(`
383383
{
384384
"cpuRam": "250.5MB",
385-
"gpuVram": "476.02MB",
385+
"gpuVram": "451.45MB",
386386
}
387387
`);
388388
});

0 commit comments

Comments
 (0)