@@ -94,125 +94,4 @@ public float getFloat(int index) {
9494 return quant * scale ;
9595 }
9696
97- /**
98- * Creates a Q8_0TornadoTensor from a GGMLTensorEntry (original implementation).
99- */
100- public static Q8_0TornadoTensor createAsQ8_0 (GGMLTensorEntry entry ) {
101- if (entry .ggmlType () != GGMLType .Q8_0 ) {
102- throw new IllegalArgumentException ("Expected Q8_0 tensor, got: " + entry .ggmlType () + " for tensor: " + entry .name ());
103- }
104-
105- int [] shape = entry .shape ();
106- int size = FloatTensor .numberOfElements (shape );
107- int numBlocks = size / GGMLType .Q8_0 .getBlockSize ();
108-
109- if (size % GGMLType .Q8_0 .getBlockSize () != 0 ) {
110- throw new IllegalArgumentException ("Q8_0 tensor size must be multiple of " + GGMLType .Q8_0 .getBlockSize () + ", got: " + size + " for tensor: " + entry .name ());
111- }
112-
113- // TODO: fix Q8_0 loading in tornado layoyt
114- // currently we end up to hack it by removing
115- // tornado header from memory segment
116- MemorySegment q8Segment = entry .memorySegment ().asSlice (TornadoNativeArray .ARRAY_HEADER );
117-
118- // allocate the arrays for quantized data (int8) and scales (fp16)
119- HalfFloatArray scales = new HalfFloatArray (numBlocks );
120- Int8Array quants = new Int8Array (size );
121-
122- // unpack Q8_0 blocks: [2 bytes fp16 scale][32 bytes int8 quants]
123- ValueLayout .OfShort shortLayout = ValueLayout .JAVA_SHORT_UNALIGNED .withOrder (ByteOrder .LITTLE_ENDIAN );
124- ValueLayout .OfByte byteLayout = ValueLayout .JAVA_BYTE ;
125-
126- // element-wise copy and unpack from MemorySegment to HalfFloatArray scales and Int8Array quants
127- // use parallel streams and unroll inner loop for better performance
128- IntStream .range (0 , numBlocks )
129- .parallel ()
130- .forEach (block -> {
131- // TODO: use GGML type method for the 34L size
132- long blockOffset = block * 34L ; // 34 bytes per block
133-
134- // read fp16 scale (first 2 bytes of block)
135- short scaleRaw = q8Segment .get (shortLayout , blockOffset );
136- scales .set (block , new HalfFloat (scaleRaw ));
137- int blockStart = block * 32 ;
138-
139- // read 32 int8 quantized values (remaining bytes of block)
140- // TODO: use GGML type method for the 32 size
141- for (int i = 0 ; i < 32 ; i += 4 ) {
142- // unroll inner loop for better performance
143- byte q0 = q8Segment .get (byteLayout , blockOffset + 2 + i );
144- byte q1 = q8Segment .get (byteLayout , blockOffset + 2 + i + 1 );
145- byte q2 = q8Segment .get (byteLayout , blockOffset + 2 + i + 2 );
146- byte q3 = q8Segment .get (byteLayout , blockOffset + 2 + i + 3 );
147-
148- quants .set (blockStart + i , q0 );
149- quants .set (blockStart + i + 1 , q1 );
150- quants .set (blockStart + i + 2 , q2 );
151- quants .set (blockStart + i + 3 , q3 );
152- }
153- });
154-
155- return new Q8_0TornadoTensor (size , scales , quants , q8Segment );
156- }
157-
158- /**
159- * Creates a Q8_0TornadoTensor formulated as FP32TornadoTensor object from a GGMLTensorEntry.
160- * NOTE: Hack implementation to comply with FP32 inference.
161- */
162- public static FP32TornadoTensor createAsFP32 (GGMLTensorEntry entry ) {
163- if (entry .ggmlType () != GGMLType .Q8_0 ) {
164- throw new IllegalArgumentException ("Expected Q8_0 tensor, got: " + entry .ggmlType () + " for tensor: " + entry .name ());
165- }
166-
167- int [] shape = entry .shape ();
168- int size = FloatTensor .numberOfElements (shape );
169- int numBlocks = size / GGMLType .Q8_0 .getBlockSize ();
170-
171- if (size % GGMLType .Q8_0 .getBlockSize () != 0 ) {
172- throw new IllegalArgumentException ("Q8_0 tensor size must be multiple of " + GGMLType .Q8_0 .getBlockSize () + ", got: " + size + " for tensor: " + entry .name ());
173- }
174-
175- // TODO: fix Q8_0 loading in tornado layoyt
176- // currently we end up to hack it by removing
177- // tornado header from memory segment
178- MemorySegment q8Segment = entry .memorySegment ().asSlice (TornadoNativeArray .ARRAY_HEADER );
179-
180- // allocate the FloatArray to store the result
181- FloatArray floatArray = new FloatArray (size );
182-
183- // unpack Q8_0 blocks: [2 bytes fp16 scale][32 bytes int8 quants]
184- ValueLayout .OfShort shortLayout = ValueLayout .JAVA_SHORT_UNALIGNED .withOrder (ByteOrder .LITTLE_ENDIAN );
185- ValueLayout .OfByte byteLayout = ValueLayout .JAVA_BYTE ;
186-
187- // element-wise dequantization and copy from MemorySegment to FloatArray
188- // use parallel streams and unroll inner loop for better performance
189- IntStream .range (0 , numBlocks )
190- .parallel ()
191- .forEach (block -> {
192- // TODO: use GGML type method for the 34L size
193- long blockOffset = block * 34L ; // 34 bytes per block
194-
195- // read fp16 scale (first 2 bytes of block) and convert to float
196- short scaleRaw = q8Segment .get (shortLayout , blockOffset );
197- float scale = Float .float16ToFloat (scaleRaw );
198- int blockStart = block * 32 ;
199-
200- // read 32 int8 quantized values (remaining bytes of block)
201- // TODO: use GGML type method for the 32 size
202- for (int i = 0 ; i < 32 ; i += 4 ) {
203- // unroll inner loop for better performance
204- byte q0 = q8Segment .get (byteLayout , blockOffset + 2 + i );
205- byte q1 = q8Segment .get (byteLayout , blockOffset + 2 + i + 1 );
206- byte q2 = q8Segment .get (byteLayout , blockOffset + 2 + i + 2 );
207- byte q3 = q8Segment .get (byteLayout , blockOffset + 2 + i + 3 );
208-
209- floatArray .set (blockStart + i , q0 * scale );
210- floatArray .set (blockStart + i + 1 , q1 * scale );
211- floatArray .set (blockStart + i + 2 , q2 * scale );
212- floatArray .set (blockStart + i + 3 , q3 * scale );
213- }
214- });
215-
216- return new FP32TornadoTensor (floatArray );
217- }
21897}
0 commit comments