|
38 | 38 | import org.apache.flink.streaming.api.operators.Output; |
39 | 39 | import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; |
40 | 40 | import org.apache.flink.streaming.runtime.tasks.StreamTask; |
| 41 | + |
41 | 42 | import org.apache.hudi.common.util.Functions; |
42 | 43 | import org.apache.hudi.common.util.hash.BucketIndexUtil; |
43 | 44 | import org.apache.hudi.configuration.FlinkOptions; |
@@ -174,9 +175,9 @@ public void processElement(StreamRecord<Event> streamRecord) throws Exception { |
174 | 175 | } |
175 | 176 | |
176 | 177 | /** |
177 | | - * Calculate which task index should handle this event by: |
178 | | - * 1. Calculating the bucket number (0 to numBuckets-1) based on record key |
179 | | - * 2. Using partitionIndexFunc to map bucket -> task index for balanced distribution |
| 178 | + * Calculate which task index should handle this event by: 1. Calculating the bucket number (0 |
| 179 | + * to numBuckets-1) based on record key 2. Using partitionIndexFunc to map bucket -> task index |
| 180 | + * for balanced distribution |
180 | 181 | * |
181 | 182 | * @param event The DataChangeEvent to calculate task index for |
182 | 183 | * @return The task index (0 to parallelism-1) that should handle this event |
@@ -252,9 +253,10 @@ private int calculateTaskIndex(DataChangeEvent event) { |
252 | 253 | String tableIndexKeyFields = String.join(",", primaryKeys); |
253 | 254 | int bucketNumber = BucketIdentifier.getBucketId(recordKey, tableIndexKeyFields, numBuckets); |
254 | 255 | |
| 256 | + // Extract partition path from the event |
| 257 | + String partition = extractPartitionPath(event, finalSchema, fieldGetters); |
| 258 | + |
255 | 259 | // Use partition function to map bucket to task index for balanced distribution |
256 | | - // partition is "default" since we're not using Hudi partition fields in this context |
257 | | - String partition = "default"; |
258 | 260 | int taskIndex = partitionIndexFunc.apply(numBuckets, partition, bucketNumber); |
259 | 261 | |
260 | 262 | return taskIndex; |
@@ -288,4 +290,123 @@ private String extractRecordKey( |
288 | 290 | |
289 | 291 | return String.join(",", recordKeyPairs); |
290 | 292 | } |
| 293 | + |
| 294 | + /** |
| 295 | + * Extract partition path from the DataChangeEvent based on schema partition keys. |
| 296 | + * |
| 297 | + * <p>If the schema has partition keys defined: |
| 298 | + * |
| 299 | + * <ul> |
| 300 | + * <li>Extracts partition field values from the record data |
| 301 | + * <li>Formats them as "field1=value1/field2=value2" (Hive-style partitioning) |
| 302 | + * </ul> |
| 303 | + * |
| 304 | + * <p>If no partition keys are defined, returns "default". |
| 305 | + * |
| 306 | + * @param event The DataChangeEvent to extract partition from |
| 307 | + * @param schema The table schema containing partition key definitions |
| 308 | + * @param fieldGetters Field getters for extracting values (currently unused; kept for a |
| 309 | + * possible future optimization) |
| 310 | + * @return The partition path string |
| 311 | + */ |
| 312 | + private String extractPartitionPath( |
| 313 | + DataChangeEvent event, Schema schema, List<RecordData.FieldGetter> fieldGetters) { |
| 314 | + |
| 315 | + // Check if schema has partition keys defined |
| 316 | + List<String> partitionKeys = schema.partitionKeys(); |
| 317 | + if (partitionKeys == null || partitionKeys.isEmpty()) { |
| 318 | + return "default"; |
| 319 | + } |
| 320 | + |
| 321 | + // Get the record data to extract from (after for INSERT/UPDATE/REPLACE, before for DELETE) |
| 322 | + RecordData recordData; |
| 323 | + switch (event.op()) { |
| 324 | + case INSERT: |
| 325 | + case UPDATE: |
| 326 | + case REPLACE: |
| 327 | + recordData = event.after(); |
| 328 | + break; |
| 329 | + case DELETE: |
| 330 | + recordData = event.before(); |
| 331 | + break; |
| 332 | + default: |
| 333 | + throw new IllegalArgumentException("Unsupported operation: " + event.op()); |
| 334 | + } |
| 335 | + |
| 336 | + if (recordData == null) { |
| 337 | + throw new IllegalStateException( |
| 338 | + "Cannot extract partition path: " + event.op() + " event has null data"); |
| 339 | + } |
| 340 | + |
| 341 | + // Extract partition values and build partition path |
| 342 | + List<String> partitionParts = new ArrayList<>(partitionKeys.size()); |
| 343 | + for (String partitionKey : partitionKeys) { |
| 344 | + int fieldIndex = schema.getColumnNames().indexOf(partitionKey); |
| 345 | + if (fieldIndex == -1) { |
| 346 | + throw new IllegalStateException( |
| 347 | + "Partition key field '" |
| 348 | + + partitionKey |
| 349 | + + "' not found in schema for table " |
| 350 | + + event.tableId()); |
| 351 | + } |
| 352 | + |
| 353 | + // Get field value |
| 354 | + Object fieldValue; |
| 355 | + if (recordData.isNullAt(fieldIndex)) { |
| 356 | + // Handle null partition values - use "__HIVE_DEFAULT_PARTITION__" as per Hive |
| 357 | + // convention |
| 358 | + fieldValue = "__HIVE_DEFAULT_PARTITION__"; |
| 359 | + } else { |
| 360 | + // Get the field value based on the field type |
| 361 | + DataType fieldType = schema.getColumns().get(fieldIndex).getType(); |
| 362 | + fieldValue = getFieldValue(recordData, fieldIndex, fieldType); |
| 363 | + } |
| 364 | + |
| 365 | + // Format as "key=value" (Hive-style partitioning) |
| 366 | + partitionParts.add(partitionKey + "=" + fieldValue); |
| 367 | + } |
| 368 | + |
| 369 | + // Join partition parts with "/" |
| 370 | + return String.join("/", partitionParts); |
| 371 | + } |
| 372 | + |
| 373 | + /** |
| 374 | + * Extract field value from RecordData based on field type. This is a simplified version - |
| 375 | + * complex types may need additional handling. |
| 376 | + */ |
| 377 | + private Object getFieldValue(RecordData recordData, int fieldIndex, DataType fieldType) { |
| 378 | + switch (fieldType.getTypeRoot()) { |
| 379 | + case CHAR: |
| 380 | + case VARCHAR: |
| 381 | + return recordData.getString(fieldIndex).toString(); |
| 382 | + case BOOLEAN: |
| 383 | + return recordData.getBoolean(fieldIndex); |
| 384 | + case TINYINT: |
| 385 | + return recordData.getByte(fieldIndex); |
| 386 | + case SMALLINT: |
| 387 | + return recordData.getShort(fieldIndex); |
| 388 | + case INTEGER: |
| 389 | + case DATE: |
| 390 | + return recordData.getInt(fieldIndex); |
| 391 | + case BIGINT: |
| 392 | + return recordData.getLong(fieldIndex); |
| 393 | + case FLOAT: |
| 394 | + return recordData.getFloat(fieldIndex); |
| 395 | + case DOUBLE: |
| 396 | + return recordData.getDouble(fieldIndex); |
| 397 | + case TIMESTAMP_WITHOUT_TIME_ZONE: |
| 398 | + return recordData.getTimestamp( |
| 399 | + fieldIndex, |
| 400 | + org.apache.flink.cdc.common.types.DataTypeChecks.getPrecision(fieldType)); |
| 401 | + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: |
| 402 | + return recordData.getLocalZonedTimestampData( |
| 403 | + fieldIndex, |
| 404 | + org.apache.flink.cdc.common.types.DataTypeChecks.getPrecision(fieldType)); |
| 405 | + default: |
| 406 | + // For other types, create a field getter and use it |
| 407 | + RecordData.FieldGetter fieldGetter = |
| 408 | + RecordData.createFieldGetter(fieldType, fieldIndex); |
| 409 | + return fieldGetter.getFieldOrNull(recordData); |
| 410 | + } |
| 411 | + } |
291 | 412 | } |
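For context on the routing logic above (record key -> bucket, then (partition, bucket) -> task index), here is a minimal, self-contained sketch of the idea. The `simpleBucketId` and `partitionIndex` methods below are hypothetical stand-ins; they do not reproduce Hudi's `BucketIdentifier`/`BucketIndexUtil` hashing, only the shape of the mapping that `calculateTaskIndex` relies on:

```java
/**
 * Illustrative sketch only: record key -> bucket, then (partition, bucket) -> task index.
 * The hashing here is made up and does NOT match Hudi's actual bucket index implementation.
 */
public class BucketRoutingSketch {

    /** Hypothetical stand-in for BucketIdentifier.getBucketId: hash the record key into [0, numBuckets). */
    static int simpleBucketId(String recordKey, int numBuckets) {
        return Math.floorMod(recordKey.hashCode(), numBuckets);
    }

    /** Hypothetical stand-in for partitionIndexFunc: spread (partition, bucket) pairs across subtasks. */
    static int partitionIndex(String partition, int bucketNumber, int parallelism) {
        long mixed = partition.hashCode() * 31L + bucketNumber;
        return (int) Math.floorMod(mixed, (long) parallelism);
    }

    public static void main(String[] args) {
        int numBuckets = 4;
        int parallelism = 3;
        String recordKey = "id:1001";

        int bucket = simpleBucketId(recordKey, numBuckets);
        // Before this change the partition argument was always "default"; after it, the real
        // partition path also feeds the mapping, so the task index can differ per partition.
        int taskBefore = partitionIndex("default", bucket, parallelism);
        int taskAfter = partitionIndex("region=EU/dt=2024-01-01", bucket, parallelism);
        System.out.println("bucket=" + bucket + ", task(before)=" + taskBefore + ", task(after)=" + taskAfter);
    }
}
```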
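Likewise, a small sketch of the Hive-style partition path format that `extractPartitionPath` is documented to produce: `field1=value1/field2=value2`, `__HIVE_DEFAULT_PARTITION__` for null values, and `"default"` when no partition keys are configured. Plain `Map`/`List` values stand in for the Flink CDC `Schema`/`RecordData` API, and the column names are made up for illustration:

```java
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

/** Illustrative sketch of the Hive-style partition path formatting described above. */
public class PartitionPathSketch {

    /** Build "k1=v1/k2=v2"; null values fall back to Hive's default partition name. */
    static String partitionPath(List<String> partitionKeys, Map<String, Object> row) {
        if (partitionKeys == null || partitionKeys.isEmpty()) {
            return "default";
        }
        return partitionKeys.stream()
                .map(key -> {
                    Object value = row.get(key);
                    return key + "=" + (value == null ? "__HIVE_DEFAULT_PARTITION__" : value);
                })
                .collect(Collectors.joining("/"));
    }

    public static void main(String[] args) {
        Map<String, Object> row = new HashMap<>();
        row.put("region", "EU");
        row.put("dt", null); // null partition value

        // Prints: region=EU/dt=__HIVE_DEFAULT_PARTITION__
        System.out.println(partitionPath(Arrays.asList("region", "dt"), row));
        // Prints: default
        System.out.println(partitionPath(Collections.emptyList(), row));
    }
}
```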