Skip to content

Commit 56a960a

Browse files
Remove deprecated methods for Q8_0 tensor loading and conversion to FP32
1 parent 2316ca1 commit 56a960a

File tree

2 files changed

+0
-154
lines changed

2 files changed

+0
-154
lines changed

src/main/java/org/beehive/gpullama3/model/loader/ModelLoader.java

Lines changed: 0 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -145,31 +145,6 @@ public static TornadoTensor[] loadArrayOfTornadoTensors(int size, IntFunction<GG
145145
return array;
146146
}
147147

148-
/**
149-
* Load a tensor and manually convert to FP32 (FloatArray).
150-
* Used for embeddings that currently are treated as FP32.
151-
* TODO: it is ultra-slow and should be removed
152-
*/
153-
public static TornadoTensor loadTornadoTensorAsFP32(GGMLTensorEntry entry) {
154-
TornadoTensor tensor = loadTornadoTensor(entry);
155-
return switch (tensor.type()) {
156-
case F32 -> tensor;
157-
case F16 -> {
158-
HalfFloatArray tensorHFA = tensor.asHalfFloatArray();
159-
int numOfElements = tensorHFA.getSize();
160-
FloatArray tensorFA = new FloatArray(numOfElements);
161-
for (int i = 0; i < numOfElements; i++) {
162-
tensorFA.set(i, tensorHFA.get(i).getFloat32());
163-
}
164-
yield new FP32TornadoTensor(tensorFA);
165-
}
166-
case Q8_0 -> Q8_0TornadoTensor.createAsFP32(entry);
167-
default -> {
168-
throw new UnsupportedOperationException("Unsupported tensor type: " + tensor.type());
169-
}
170-
};
171-
}
172-
173148
// Helper methods
174149

175150
public static FloatArray[] loadArrayAsFloatArray(int size, IntFunction<GGMLTensorEntry> getTensorEntry) {
@@ -188,14 +163,6 @@ public static HalfFloatArray[] loadArrayAsHalfFloatArray(int size, IntFunction<G
188163
return array;
189164
}
190165

191-
public static Q8_0TornadoTensor[] loadArrayAsQ8_0TornadoTensor(int size, IntFunction<GGMLTensorEntry> getTensorEntry) {
192-
Q8_0TornadoTensor[] array = new Q8_0TornadoTensor[size];
193-
for (int i = 0; i < size; i++) {
194-
array[i] = Q8_0TornadoTensor.createAsQ8_0(getTensorEntry.apply(i));
195-
}
196-
return array;
197-
}
198-
199166
public static FloatArray floatBufferToFloatArray(GGMLTensorEntry tensorEntry) {
200167
if (tensorEntry.ggmlType() == GGMLType.F32) {
201168
FloatBuffer buffer = tensorEntry.memorySegment().asByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer();

src/main/java/org/beehive/gpullama3/tensor/tornado/Q8_0TornadoTensor.java

Lines changed: 0 additions & 121 deletions
Original file line numberDiff line numberDiff line change
@@ -94,125 +94,4 @@ public float getFloat(int index) {
9494
return quant * scale;
9595
}
9696

97-
/**
98-
* Creates a Q8_0TornadoTensor from a GGMLTensorEntry (original implementation).
99-
*/
100-
public static Q8_0TornadoTensor createAsQ8_0(GGMLTensorEntry entry) {
101-
if (entry.ggmlType() != GGMLType.Q8_0) {
102-
throw new IllegalArgumentException("Expected Q8_0 tensor, got: " + entry.ggmlType() + " for tensor: " + entry.name());
103-
}
104-
105-
int[] shape = entry.shape();
106-
int size = FloatTensor.numberOfElements(shape);
107-
int numBlocks = size / GGMLType.Q8_0.getBlockSize();
108-
109-
if (size % GGMLType.Q8_0.getBlockSize() != 0) {
110-
throw new IllegalArgumentException("Q8_0 tensor size must be multiple of " + GGMLType.Q8_0.getBlockSize() + ", got: " + size + " for tensor: " + entry.name());
111-
}
112-
113-
// TODO: fix Q8_0 loading in tornado layout
114-
// currently we work around it by removing
115-
// tornado header from memory segment
116-
MemorySegment q8Segment = entry.memorySegment().asSlice(TornadoNativeArray.ARRAY_HEADER);
117-
118-
// allocate the arrays for quantized data (int8) and scales (fp16)
119-
HalfFloatArray scales = new HalfFloatArray(numBlocks);
120-
Int8Array quants = new Int8Array(size);
121-
122-
// unpack Q8_0 blocks: [2 bytes fp16 scale][32 bytes int8 quants]
123-
ValueLayout.OfShort shortLayout = ValueLayout.JAVA_SHORT_UNALIGNED.withOrder(ByteOrder.LITTLE_ENDIAN);
124-
ValueLayout.OfByte byteLayout = ValueLayout.JAVA_BYTE;
125-
126-
// element-wise copy and unpack from MemorySegment to HalfFloatArray scales and Int8Array quants
127-
// use parallel streams and unroll inner loop for better performance
128-
IntStream.range(0, numBlocks)
129-
.parallel()
130-
.forEach(block -> {
131-
// TODO: use GGML type method for the 34L size
132-
long blockOffset = block * 34L; // 34 bytes per block
133-
134-
// read fp16 scale (first 2 bytes of block)
135-
short scaleRaw = q8Segment.get(shortLayout, blockOffset);
136-
scales.set(block, new HalfFloat(scaleRaw));
137-
int blockStart = block * 32;
138-
139-
// read 32 int8 quantized values (remaining bytes of block)
140-
// TODO: use GGML type method for the 32 size
141-
for (int i = 0; i < 32; i += 4) {
142-
// unroll inner loop for better performance
143-
byte q0 = q8Segment.get(byteLayout, blockOffset + 2 + i);
144-
byte q1 = q8Segment.get(byteLayout, blockOffset + 2 + i + 1);
145-
byte q2 = q8Segment.get(byteLayout, blockOffset + 2 + i + 2);
146-
byte q3 = q8Segment.get(byteLayout, blockOffset + 2 + i + 3);
147-
148-
quants.set(blockStart + i, q0);
149-
quants.set(blockStart + i + 1, q1);
150-
quants.set(blockStart + i + 2, q2);
151-
quants.set(blockStart + i + 3, q3);
152-
}
153-
});
154-
155-
return new Q8_0TornadoTensor(size, scales, quants, q8Segment);
156-
}
157-
158-
/**
159-
* Creates an FP32TornadoTensor by dequantizing the Q8_0 data of a GGMLTensorEntry.
160-
* NOTE: Hack implementation to comply with FP32 inference.
161-
*/
162-
public static FP32TornadoTensor createAsFP32(GGMLTensorEntry entry) {
163-
if (entry.ggmlType() != GGMLType.Q8_0) {
164-
throw new IllegalArgumentException("Expected Q8_0 tensor, got: " + entry.ggmlType() + " for tensor: " + entry.name());
165-
}
166-
167-
int[] shape = entry.shape();
168-
int size = FloatTensor.numberOfElements(shape);
169-
int numBlocks = size / GGMLType.Q8_0.getBlockSize();
170-
171-
if (size % GGMLType.Q8_0.getBlockSize() != 0) {
172-
throw new IllegalArgumentException("Q8_0 tensor size must be multiple of " + GGMLType.Q8_0.getBlockSize() + ", got: " + size + " for tensor: " + entry.name());
173-
}
174-
175-
// TODO: fix Q8_0 loading in tornado layout
176-
// currently we work around it by removing
177-
// tornado header from memory segment
178-
MemorySegment q8Segment = entry.memorySegment().asSlice(TornadoNativeArray.ARRAY_HEADER);
179-
180-
// allocate the FloatArray to store the result
181-
FloatArray floatArray = new FloatArray(size);
182-
183-
// unpack Q8_0 blocks: [2 bytes fp16 scale][32 bytes int8 quants]
184-
ValueLayout.OfShort shortLayout = ValueLayout.JAVA_SHORT_UNALIGNED.withOrder(ByteOrder.LITTLE_ENDIAN);
185-
ValueLayout.OfByte byteLayout = ValueLayout.JAVA_BYTE;
186-
187-
// element-wise dequantization and copy from MemorySegment to FloatArray
188-
// use parallel streams and unroll inner loop for better performance
189-
IntStream.range(0, numBlocks)
190-
.parallel()
191-
.forEach(block -> {
192-
// TODO: use GGML type method for the 34L size
193-
long blockOffset = block * 34L; // 34 bytes per block
194-
195-
// read fp16 scale (first 2 bytes of block) and convert to float
196-
short scaleRaw = q8Segment.get(shortLayout, blockOffset);
197-
float scale = Float.float16ToFloat(scaleRaw);
198-
int blockStart = block * 32;
199-
200-
// read 32 int8 quantized values (remaining bytes of block)
201-
// TODO: use GGML type method for the 32 size
202-
for (int i = 0; i < 32; i += 4) {
203-
// unroll inner loop for better performance
204-
byte q0 = q8Segment.get(byteLayout, blockOffset + 2 + i);
205-
byte q1 = q8Segment.get(byteLayout, blockOffset + 2 + i + 1);
206-
byte q2 = q8Segment.get(byteLayout, blockOffset + 2 + i + 2);
207-
byte q3 = q8Segment.get(byteLayout, blockOffset + 2 + i + 3);
208-
209-
floatArray.set(blockStart + i, q0 * scale);
210-
floatArray.set(blockStart + i + 1, q1 * scale);
211-
floatArray.set(blockStart + i + 2, q2 * scale);
212-
floatArray.set(blockStart + i + 3, q3 * scale);
213-
}
214-
});
215-
216-
return new FP32TornadoTensor(floatArray);
217-
}
21897
}

0 commit comments

Comments
 (0)