diff --git a/Cargo.lock b/Cargo.lock index 65392512ce9..f1af838abc8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -36,6 +36,20 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if 1.0.0", + "const-random", + "getrandom 0.3.1", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.3" @@ -51,6 +65,279 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" +[[package]] +name = "alloy" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e15860af634cad451f598712c24ca7fd9b45d84fff68ab8d4967567fa996c64" +dependencies = [ + "alloy-consensus", + "alloy-core", + "alloy-eips", + "alloy-serde", + "alloy-trie", +] + +[[package]] +name = "alloy-consensus" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b6440213a22df93a87ed512d2f668e7dc1d62a05642d107f82d61edc9e12370" +dependencies = [ + "alloy-eips", + "alloy-primitives", + "alloy-rlp", + "alloy-serde", + "alloy-trie", + "alloy-tx-macros", + "auto_impl", + "c-kzg", + "derive_more 2.0.1", + "either", + "k256", + "once_cell", + "rand 0.8.5", + "secp256k1 0.30.0", + "serde", + "serde_json", + "thiserror 2.0.16", +] + +[[package]] +name = "alloy-core" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ca96214615ec8cf3fa2a54b32f486eb49100ca7fe7eb0b8c1137cd316e7250a" +dependencies = [ + "alloy-json-abi", + "alloy-primitives", + "alloy-sol-types", +] + +[[package]] +name = "alloy-eip2124" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "741bdd7499908b3aa0b159bba11e71c8cddd009a2c2eb7a06e825f1ec87900a5" +dependencies = [ + "alloy-primitives", + "alloy-rlp", + "crc", + "serde", + "thiserror 2.0.16", +] + +[[package]] +name = "alloy-eip2930" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9441120fa82df73e8959ae0e4ab8ade03de2aaae61be313fbf5746277847ce25" +dependencies = [ + "alloy-primitives", + "alloy-rlp", + "serde", +] + +[[package]] +name = "alloy-eip7702" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2919c5a56a1007492da313e7a3b6d45ef5edc5d33416fdec63c0d7a2702a0d20" +dependencies = [ + "alloy-primitives", + "alloy-rlp", + "serde", + "thiserror 2.0.16", +] + +[[package]] +name = "alloy-eips" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bd2c7ae05abcab4483ce821f12f285e01c0b33804e6883dd9ca1569a87ee2be" +dependencies = [ + "alloy-eip2124", + "alloy-eip2930", + "alloy-eip7702", + "alloy-primitives", + "alloy-rlp", + "alloy-serde", + "auto_impl", + "c-kzg", + "derive_more 2.0.1", + "either", + "serde", + "serde_with", + "sha2", + "thiserror 2.0.16", +] + +[[package]] +name = "alloy-json-abi" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5513d5e6bd1cba6bdcf5373470f559f320c05c8c59493b6e98912fbe6733943f" +dependencies = [ + "alloy-primitives", + "alloy-sol-type-parser", + "serde", + "serde_json", 
+] + +[[package]] +name = "alloy-primitives" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "355bf68a433e0fd7f7d33d5a9fc2583fde70bf5c530f63b80845f8da5505cf28" +dependencies = [ + "alloy-rlp", + "bytes", + "cfg-if 1.0.0", + "const-hex", + "derive_more 2.0.1", + "hashbrown 0.16.1", + "indexmap 2.11.4", + "itoa", + "paste", + "rand 0.9.2", + "ruint", + "serde", + "tiny-keccak 2.0.2", +] + +[[package]] +name = "alloy-rlp" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f70d83b765fdc080dbcd4f4db70d8d23fe4761f2f02ebfa9146b833900634b4" +dependencies = [ + "alloy-rlp-derive", + "arrayvec 0.7.4", + "bytes", +] + +[[package]] +name = "alloy-rlp-derive" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64b728d511962dda67c1bc7ea7c03736ec275ed2cf4c35d9585298ac9ccf3b73" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "alloy-serde" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6f180c399ca7c1e2fe17ea58343910cad0090878a696ff5a50241aee12fc529" +dependencies = [ + "alloy-primitives", + "serde", + "serde_json", +] + +[[package]] +name = "alloy-sol-macro" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3ce480400051b5217f19d6e9a82d9010cdde20f1ae9c00d53591e4a1afbb312" +dependencies = [ + "alloy-sol-macro-expander", + "alloy-sol-macro-input", + "proc-macro-error2", + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "alloy-sol-macro-expander" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d792e205ed3b72f795a8044c52877d2e6b6e9b1d13f431478121d8d4eaa9028" +dependencies = [ + "alloy-json-abi", + "alloy-sol-macro-input", + "const-hex", + "heck 0.5.0", + "indexmap 2.11.4", + "proc-macro-error2", + "proc-macro2", + "quote", + "syn 2.0.106", + "syn-solidity", + "tiny-keccak 2.0.2", +] + +[[package]] +name = "alloy-sol-macro-input" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bd1247a8f90b465ef3f1207627547ec16940c35597875cdc09c49d58b19693c" +dependencies = [ + "alloy-json-abi", + "const-hex", + "dunce", + "heck 0.5.0", + "macro-string", + "proc-macro2", + "quote", + "serde_json", + "syn 2.0.106", + "syn-solidity", +] + +[[package]] +name = "alloy-sol-type-parser" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "954d1b2533b9b2c7959652df3076954ecb1122a28cc740aa84e7b0a49f6ac0a9" +dependencies = [ + "serde", + "winnow 0.7.13", +] + +[[package]] +name = "alloy-sol-types" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70319350969a3af119da6fb3e9bddb1bce66c9ea933600cb297c8b1850ad2a3c" +dependencies = [ + "alloy-json-abi", + "alloy-primitives", + "alloy-sol-macro", +] + +[[package]] +name = "alloy-trie" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3412d52bb97c6c6cc27ccc28d4e6e8cf605469101193b50b0bd5813b1f990b5" +dependencies = [ + "alloy-primitives", + "alloy-rlp", + "arrayvec 0.7.4", + "derive_more 2.0.1", + "nybbles", + "serde", + "smallvec", + "tracing", +] + +[[package]] +name = "alloy-tx-macros" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae109e33814b49fc0a62f2528993aa8a2dd346c26959b151f05441dc0b9da292" 
+dependencies = [ + "darling 0.21.3", + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "android_system_properties" version = "0.1.5" @@ -144,6 +431,243 @@ name = "arrayvec" version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" +dependencies = [ + "serde", +] + +[[package]] +name = "arrow" +version = "55.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3095aaf545942ff5abd46654534f15b03a90fba78299d661e045e5d587222f0d" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30feb679425110209ae35c3fbf82404a39a4c0436bb3ec36164d8bffed2a4ce4" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "num", +] + +[[package]] +name = "arrow-array" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70732f04d285d49054a48b72c54f791bb3424abae92d27aafdf776c98af161c8" +dependencies = [ + "ahash", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "hashbrown 0.15.2", + "num", +] + +[[package]] +name = "arrow-buffer" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "169b1d5d6cb390dd92ce582b06b23815c7953e9dfaaea75556e89d890d19993d" +dependencies = [ + "bytes", + "half", + "num", +] + +[[package]] +name = "arrow-cast" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4f12eccc3e1c05a766cafb31f6a60a46c2f8efec9b74c6e0648766d30686af8" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "atoi", + "base64 0.22.1", + "chrono", + "half", + "lexical-core", + "num", + "ryu", +] + +[[package]] +name = "arrow-csv" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "012c9fef3f4a11573b2c74aec53712ff9fdae4a95f4ce452d1bbf088ee00f06b" +dependencies = [ + "arrow-array", + "arrow-cast", + "arrow-schema", + "chrono", + "csv", + "csv-core", + "regex", +] + +[[package]] +name = "arrow-data" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8de1ce212d803199684b658fc4ba55fb2d7e87b213de5af415308d2fee3619c2" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half", + "num", +] + +[[package]] +name = "arrow-flight" +version = "55.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2e0fad280f41a918d53ba48288a246ff04202d463b3b380fbc0edecdcb52cfd" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", + "base64 0.22.1", + "bytes", + "futures 0.3.31", + "once_cell", + "paste", + "prost", + "prost-types", + "tonic", +] + +[[package]] +name = "arrow-ipc" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9ea5967e8b2af39aff5d9de2197df16e305f47f404781d3230b2dc672da5d92" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "flatbuffers", +] + 
+[[package]] +name = "arrow-json" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5709d974c4ea5be96d900c01576c7c0b99705f4a3eec343648cb1ca863988a9c" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "indexmap 2.11.4", + "lexical-core", + "memchr", + "num", + "serde", + "serde_json", + "simdutf8", +] + +[[package]] +name = "arrow-ord" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6506e3a059e3be23023f587f79c82ef0bcf6d293587e3272d20f2d30b969b5a7" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", +] + +[[package]] +name = "arrow-row" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52bf7393166beaf79b4bed9bfdf19e97472af32ce5b6b48169d321518a08cae2" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half", +] + +[[package]] +name = "arrow-schema" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af7686986a3bf2254c9fb130c623cdcb2f8e1f15763e7c71c310f0834da3d292" + +[[package]] +name = "arrow-select" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd2b45757d6a2373faa3352d02ff5b54b098f5e21dccebc45a21806bc34501e5" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num", +] + +[[package]] +name = "arrow-string" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0377d532850babb4d927a06294314b316e23311503ed580ec6ce6a0158f49d40" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "memchr", + "num", + "regex", + "regex-syntax", +] [[package]] name = "ascii_utils" @@ -208,7 +732,7 @@ dependencies = [ "serde_json", "tokio", "tokio-stream", - "tokio-util 0.7.11", + "tokio-util 0.7.17", "tower-service 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -220,7 +744,7 @@ checksum = "fd45deb3dbe5da5cdb8d6a670a7736d735ba65b455328440f236dfb113727a3d" dependencies = [ "Inflector", "async-graphql-parser", - "darling", + "darling 0.20.10", "proc-macro-crate", "proc-macro2", "quote", @@ -297,6 +821,15 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + [[package]] name = "atomic-waker" version = "1.1.2" @@ -320,6 +853,17 @@ dependencies = [ "winapi", ] +[[package]] +name = "auto_impl" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffdcb70bdbc4d478427380519163274ac86e52916e10f0a8889adf0f96d3fee7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "autocfg" version = "1.3.0" @@ -460,6 +1004,12 @@ version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4cbbc9d0964165b47557570cce6c952866c2678457aca742aafc9fb771d30270" +[[package]] +name = "base16ct" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf" + [[package]] name = "base64" version = "0.13.1" @@ -478,6 +1028,12 @@ version = "0.22.1" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "base64ct" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55248b47b0caf0546f7988906588779981c43bb1bc9d0c44087278f80cdb44ba" + [[package]] name = "beef" version = "0.5.2" @@ -510,6 +1066,22 @@ dependencies = [ "num-traits", ] +[[package]] +name = "bitcoin-io" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dee39a0ee5b4095224a0cfc6bf4cc1baf0f9624b96b367e53b66d974e51d953" + +[[package]] +name = "bitcoin_hashes" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb18c03d0db0247e147a21a6faafd5a7eb851c743db062de72018b6b7e8e4d16" +dependencies = [ + "bitcoin-io", + "hex-conservative", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -577,7 +1149,19 @@ version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" dependencies = [ - "generic-array", + "generic-array", +] + +[[package]] +name = "blst" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcdb4c7013139a150f9fc55d123186dbfaba0d912817466282c73ac49e71fb45" +dependencies = [ + "cc", + "glob", + "threadpool", + "zeroize", ] [[package]] @@ -635,6 +1219,21 @@ dependencies = [ "serde", ] +[[package]] +name = "c-kzg" +version = "2.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e00bf4b112b07b505472dbefd19e37e53307e2bfed5a79e0cc161d58ccd0e687" +dependencies = [ + "blst", + "cc", + "glob", + "hex", + "libc", + "once_cell", + "serde", +] + [[package]] name = "cc" version = "1.2.43" @@ -749,7 +1348,7 @@ dependencies = [ "memchr", "pin-project-lite", "tokio", - "tokio-util 0.7.11", + "tokio-util 0.7.17", ] [[package]] @@ -765,6 +1364,44 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "const-hex" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bb320cac8a0750d7f25280aa97b09c26edfe161164238ecbbb31092b079e735" +dependencies = [ + "cfg-if 1.0.0", + "cpufeatures", + "proptest", + "serde_core", +] + +[[package]] +name = "const-oid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" + +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom 0.2.15", + "once_cell", + "tiny-keccak 2.0.2", +] + [[package]] name = "constant_time_eq" version = "0.1.5" @@ -981,6 +1618,21 @@ version = "0.120.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "02e3f4d783a55c64266d17dc67d2708852235732a100fc40dd9f1051adc64d7b" +[[package]] +name = "crc" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eb8a2a1cd12ab0d987a5d5e825195d372001a4094a0376319d5a0ad71c1ba0d" +dependencies = [ + "crc-catalog", +] + 
+[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + [[package]] name = "crc32fast" version = "1.4.2" @@ -1052,6 +1704,18 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +[[package]] +name = "crypto-bigint" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" +dependencies = [ + "generic-array", + "rand_core 0.6.4", + "subtle", + "zeroize", +] + [[package]] name = "crypto-common" version = "0.1.6" @@ -1099,8 +1763,18 @@ version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" dependencies = [ - "darling_core", - "darling_macro", + "darling_core 0.20.10", + "darling_macro 0.20.10", +] + +[[package]] +name = "darling" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" +dependencies = [ + "darling_core 0.21.3", + "darling_macro 0.21.3", ] [[package]] @@ -1117,13 +1791,39 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "darling_core" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "serde", + "strsim", + "syn 2.0.106", +] + [[package]] name = "darling_macro" version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ - "darling_core", + "darling_core 0.20.10", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "darling_macro" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" +dependencies = [ + "darling_core 0.21.3", "quote", "syn 2.0.106", ] @@ -1186,6 +1886,16 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "930c7171c8df9fb1782bdf9b918ed9ed2d33d1d22300abb754f9085bc48bf8e8" +[[package]] +name = "der" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb" +dependencies = [ + "const-oid", + "zeroize", +] + [[package]] name = "deranged" version = "0.3.11" @@ -1344,6 +2054,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer 0.10.4", + "const-oid", "crypto-common", "subtle", ] @@ -1417,7 +2128,7 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc" dependencies = [ - "darling", + "darling 0.20.10", "either", "heck 0.5.0", "proc-macro2", @@ -1425,11 +2136,54 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + +[[package]] 
+name = "ecdsa" +version = "0.16.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee27f32b5c5292967d2d4a9d7f1e0b0aed2c15daded5a60300e4abb9d8020bca" +dependencies = [ + "der", + "digest 0.10.7", + "elliptic-curve", + "rfc6979", + "serdect", + "signature", +] + [[package]] name = "either" -version = "1.13.0" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +dependencies = [ + "serde", +] + +[[package]] +name = "elliptic-curve" +version = "0.13.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +checksum = "b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47" +dependencies = [ + "base16ct", + "crypto-bigint", + "digest 0.10.7", + "ff", + "generic-array", + "group", + "pkcs8", + "rand_core 0.6.4", + "sec1", + "serdect", + "subtle", + "zeroize", +] [[package]] name = "embedded-io" @@ -1588,6 +2342,16 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" +[[package]] +name = "ff" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0b50bfb653653f9ca9095b427bed08ab8d75a137839d9ad64eb11810d5b6393" +dependencies = [ + "rand_core 0.6.4", + "subtle", +] + [[package]] name = "find-msvc-tools" version = "0.1.4" @@ -1624,6 +2388,16 @@ version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" +[[package]] +name = "flatbuffers" +version = "25.9.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09b6620799e7340ebd9968d2e0708eb82cf1971e9a16821e2091b6d6e475eed5" +dependencies = [ + "bitflags 2.9.0", + "rustc_version", +] + [[package]] name = "flate2" version = "1.0.30" @@ -1646,6 +2420,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "foreign-types" version = "0.3.2" @@ -1817,6 +2597,7 @@ checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" dependencies = [ "typenum", "version_check", + "zeroize", ] [[package]] @@ -1881,6 +2662,12 @@ dependencies = [ "time", ] +[[package]] +name = "glob" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" + [[package]] name = "globset" version = "0.4.16" @@ -1913,6 +2700,7 @@ dependencies = [ "pq-sys", "serde", "tokio", + "tokio-util 0.7.17", ] [[package]] @@ -1920,7 +2708,11 @@ name = "graph" version = "0.36.0" dependencies = [ "Inflector", + "ahash", + "alloy", "anyhow", + "arrow", + "arrow-flight", "async-stream", "async-trait", "atomic_refcell", @@ -1943,6 +2735,7 @@ dependencies = [ "futures 0.3.31", "graph_derive", "graphql-parser", + "half", "hex", "hex-literal 1.0.0", "http 0.2.12", @@ -1952,6 +2745,7 @@ dependencies = [ "hyper 1.7.0", "hyper-util", "itertools", + "lazy-regex", "lazy_static", "lru_time_cache", "maplit", @@ -1981,7 +2775,8 @@ dependencies = [ 
"slog-async", "slog-envlogger", "slog-term", - "sqlparser", + "sqlparser 0.57.0", + "sqlparser 0.59.0", "stable-hash 0.3.4", "stable-hash 0.4.4", "strum_macros 0.27.2", @@ -1990,6 +2785,7 @@ dependencies = [ "tokio", "tokio-retry", "tokio-stream", + "tokio-util 0.7.17", "toml 0.9.7", "tonic", "tonic-build", @@ -2069,18 +2865,30 @@ dependencies = [ name = "graph-core" version = "0.36.0" dependencies = [ + "alloy", "anyhow", + "arrow", "async-trait", "atomic_refcell", "bytes", + "chrono", "cid", + "futures 0.3.31", "graph", "graph-chain-ethereum", "graph-chain-near", "graph-chain-substreams", "graph-runtime-wasm", + "indoc", + "itertools", + "parking_lot", + "prometheus", "serde_yaml", + "slog", + "strum", "thiserror 2.0.16", + "tokio", + "tokio-util 0.7.17", "tower 0.5.2 (git+https://github.com/tower-rs/tower.git)", "tower-test", "wiremock", @@ -2132,6 +2940,7 @@ dependencies = [ "serde", "shellexpand", "termcolor", + "tokio-util 0.7.17", "url", ] @@ -2250,7 +3059,7 @@ dependencies = [ "rand 0.9.2", "serde", "serde_json", - "sqlparser", + "sqlparser 0.59.0", "stable-hash 0.3.4", "thiserror 2.0.16", ] @@ -2271,12 +3080,13 @@ dependencies = [ "graph-runtime-wasm", "graph-server-index-node", "graph-store-postgres", - "secp256k1", + "secp256k1 0.21.3", "serde", "serde_yaml", "slog", "tokio", "tokio-stream", + "tokio-util 0.7.17", ] [[package]] @@ -2362,6 +3172,17 @@ dependencies = [ "serde_with", ] +[[package]] +name = "group" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63" +dependencies = [ + "ff", + "rand_core 0.6.4", + "subtle", +] + [[package]] name = "h2" version = "0.3.26" @@ -2377,7 +3198,7 @@ dependencies = [ "indexmap 2.11.4", "slab", "tokio", - "tokio-util 0.7.11", + "tokio-util 0.7.17", "tracing", ] @@ -2396,10 +3217,22 @@ dependencies = [ "indexmap 2.11.4", "slab", "tokio", - "tokio-util 0.7.11", + "tokio-util 0.7.17", "tracing", ] +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if 1.0.0", + "crunchy", + "num-traits", + "zerocopy", +] + [[package]] name = "handlebars" version = "5.1.2" @@ -2428,8 +3261,19 @@ checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" dependencies = [ "allocator-api2", "equivalent", - "foldhash", + "foldhash 0.1.5", + "serde", +] + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "foldhash 0.2.0", "serde", + "serde_core", ] [[package]] @@ -2499,6 +3343,15 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "hex-conservative" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fda06d18ac606267c40c04e41b9947729bf8b9efe74bd4e82b61a5f26a510b9f" +dependencies = [ + "arrayvec 0.7.4", +] + [[package]] name = "hex-literal" version = "0.3.4" @@ -2981,6 +3834,15 @@ dependencies = [ "serde_core", ] +[[package]] +name = "indoc" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +dependencies = [ + "rustversion", 
+] + [[package]] name = "inotify" version = "0.11.0" @@ -3221,52 +4083,145 @@ dependencies = [ "tracing", ] +[[package]] +name = "k256" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6e3919bbaa2945715f0bb6d3934a173d1e9a59ac23767fbaaef277265a7411b" +dependencies = [ + "cfg-if 1.0.0", + "ecdsa", + "elliptic-curve", + "serdect", + "sha2", +] + [[package]] name = "keccak" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecc2af9a1119c51f12a14607e783cb977bde58bc069ff0c3da1095e635d70654" +checksum = "ecc2af9a1119c51f12a14607e783cb977bde58bc069ff0c3da1095e635d70654" +dependencies = [ + "cpufeatures", +] + +[[package]] +name = "kqueue" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eac30106d7dce88daf4a3fcb4879ea939476d5074a9b7ddd0fb97fa4bed5596a" +dependencies = [ + "kqueue-sys", + "libc", +] + +[[package]] +name = "kqueue-sys" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed9625ffda8729b85e45cf04090035ac368927b8cebc34898e7c120f52e4838b" +dependencies = [ + "bitflags 1.3.2", + "libc", +] + +[[package]] +name = "lazy-regex" +version = "3.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "191898e17ddee19e60bccb3945aa02339e81edd4a8c50e21fd4d48cdecda7b29" +dependencies = [ + "lazy-regex-proc_macros", + "once_cell", + "regex", +] + +[[package]] +name = "lazy-regex-proc_macros" +version = "3.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c35dc8b0da83d1a9507e12122c80dea71a9c7c613014347392483a83ea593e04" +dependencies = [ + "proc-macro2", + "quote", + "regex", + "syn 2.0.106", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "leb128" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "884e2677b40cc8c339eaefcb701c32ef1fd2493d71118dc0ca4b6a736c93bd67" + +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + +[[package]] +name = "lexical-core" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" dependencies = [ - "cpufeatures", + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", ] [[package]] -name = "kqueue" -version = "1.1.1" +name = "lexical-parse-float" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac30106d7dce88daf4a3fcb4879ea939476d5074a9b7ddd0fb97fa4bed5596a" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" dependencies = [ - "kqueue-sys", - "libc", + "lexical-parse-integer", + "lexical-util", ] [[package]] -name = "kqueue-sys" -version = "1.0.4" +name = "lexical-parse-integer" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed9625ffda8729b85e45cf04090035ac368927b8cebc34898e7c120f52e4838b" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" dependencies = [ - "bitflags 1.3.2", - "libc", + "lexical-util", ] [[package]] -name = "lazy_static" 
-version = "1.5.0" +name = "lexical-util" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" [[package]] -name = "leb128" -version = "0.2.5" +name = "lexical-write-float" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "884e2677b40cc8c339eaefcb701c32ef1fd2493d71118dc0ca4b6a736c93bd67" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" +dependencies = [ + "lexical-util", + "lexical-write-integer", +] [[package]] -name = "leb128fmt" -version = "0.1.0" +name = "lexical-write-integer" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" +dependencies = [ + "lexical-util", +] [[package]] name = "libc" @@ -3339,6 +4294,17 @@ dependencies = [ "libc", ] +[[package]] +name = "macro-string" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b27834086c65ec3f9387b096d66e99f221cf081c2b738042aa252bcd41204e3" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "maplit" version = "1.0.2" @@ -3543,6 +4509,20 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e0826a989adedc2a244799e823aece04662b66609d96af8dff7ac6df9a8925d" +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint 0.4.6", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + [[package]] name = "num-bigint" version = "0.2.6" @@ -3565,6 +4545,15 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + [[package]] name = "num-conv" version = "0.1.0" @@ -3580,6 +4569,28 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint 0.4.6", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -3587,6 +4598,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -3599,6 +4611,18 @@ dependencies = [ "libc", ] +[[package]] +name = "nybbles" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c4b5ecbd0beec843101bffe848217f770e8b8da81d8355b7d6e226f2199b3dc" +dependencies = [ + "cfg-if 1.0.0", + "ruint", + "serde", + "smallvec", +] + [[package]] name = "object" version = "0.36.7" 
@@ -3649,9 +4673,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.19.0" +version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "opaque-debug" @@ -3783,6 +4807,12 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + [[package]] name = "percent-encoding" version = "2.3.2" @@ -3917,6 +4947,16 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkcs8" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" +dependencies = [ + "der", + "spki", +] + [[package]] name = "pkg-config" version = "0.3.30" @@ -4090,6 +5130,28 @@ dependencies = [ "toml_edit 0.21.1", ] +[[package]] +name = "proc-macro-error-attr2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96de42df36bb9bba5542fe9f1a054b8cc87e172759a1868aa05c1f3acc89dfc5" +dependencies = [ + "proc-macro2", + "quote", +] + +[[package]] +name = "proc-macro-error2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11ec05c52be0a07b08061f7dd003e7d7092e0472bc731b4af7bb1ef876109802" +dependencies = [ + "proc-macro-error-attr2", + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "proc-macro-utils" version = "0.10.0" @@ -4127,6 +5189,20 @@ dependencies = [ "thiserror 2.0.16", ] +[[package]] +name = "proptest" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bee689443a2bd0a16ab0348b52ee43e3b2d1b1f931c8aa5c9f8de4c86fbe8c40" +dependencies = [ + "bitflags 2.9.0", + "num-traits", + "rand 0.9.2", + "rand_chacha 0.9.0", + "rand_xorshift", + "unarray", +] + [[package]] name = "prost" version = "0.13.5" @@ -4327,6 +5403,7 @@ dependencies = [ "libc", "rand_chacha 0.3.1", "rand_core 0.6.4", + "serde", ] [[package]] @@ -4337,6 +5414,7 @@ checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" dependencies = [ "rand_chacha 0.9.0", "rand_core 0.9.3", + "serde", ] [[package]] @@ -4375,6 +5453,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" dependencies = [ "getrandom 0.3.1", + "serde", +] + +[[package]] +name = "rand_xorshift" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "513962919efc330f829edb2535844d1b912b0fbe2ca165d613e4e8788bb05a5a" +dependencies = [ + "rand_core 0.9.3", ] [[package]] @@ -4438,7 +5526,7 @@ dependencies = [ "sha1_smol", "socket2 0.5.7", "tokio", - "tokio-util 0.7.11", + "tokio-util 0.7.17", "url", ] @@ -4487,9 +5575,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.5" +version = "1.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" dependencies = [ "aho-corasick", "memchr", @@ -4499,9 
+5587,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.7" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" dependencies = [ "aho-corasick", "memchr", @@ -4510,9 +5598,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.4" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" [[package]] name = "reqwest" @@ -4552,7 +5640,7 @@ dependencies = [ "tokio", "tokio-native-tls", "tokio-rustls", - "tokio-util 0.7.11", + "tokio-util 0.7.17", "tower 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)", "tower-http", "tower-service 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", @@ -4563,6 +5651,16 @@ dependencies = [ "web-sys", ] +[[package]] +name = "rfc6979" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dd2a808d456c4a54e300a23e9f5a67e122c3024119acbfd73e3bf664491cb2" +dependencies = [ + "hmac", + "subtle", +] + [[package]] name = "ring" version = "0.17.13" @@ -4587,6 +5685,28 @@ dependencies = [ "rustc-hex", ] +[[package]] +name = "ruint" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a68df0380e5c9d20ce49534f292a36a7514ae21350726efe1865bdb1fa91d278" +dependencies = [ + "alloy-rlp", + "proptest", + "rand 0.8.5", + "rand 0.9.2", + "ruint-macro", + "serde_core", + "valuable", + "zeroize", +] + +[[package]] +name = "ruint-macro" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48fd7bd8a6377e15ad9d42a8ec25371b94ddc67abe7c8b9127bec79bebaaae18" + [[package]] name = "rustc-demangle" version = "0.1.24" @@ -4758,13 +5878,40 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "sec1" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3e97a565f76233a6003f9f5c54be1d9c5bdfa3eccfb189469f11ec4901c47dc" +dependencies = [ + "base16ct", + "der", + "generic-array", + "pkcs8", + "serdect", + "subtle", + "zeroize", +] + [[package]] name = "secp256k1" version = "0.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c42e6f1735c5f00f51e43e28d6634141f2bcad10931b2609ddd74a86d751260" dependencies = [ - "secp256k1-sys", + "secp256k1-sys 0.4.2", +] + +[[package]] +name = "secp256k1" +version = "0.30.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b50c5943d326858130af85e049f2661ba3c78b26589b8ab98e65e80ae44a1252" +dependencies = [ + "bitcoin_hashes", + "rand 0.8.5", + "secp256k1-sys 0.10.1", + "serde", ] [[package]] @@ -4776,6 +5923,15 @@ dependencies = [ "cc", ] +[[package]] +name = "secp256k1-sys" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4387882333d3aa8cb20530a17c69a3752e97837832f34f6dccc760e715001d9" +dependencies = [ + "cc", +] + [[package]] name = "security-framework" version = "2.11.0" @@ -4946,7 +6102,7 @@ version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"8d00caa5193a3c8362ac2b73be6b9e768aa5a4b2f721d8f4b339600c3cb51f8e" dependencies = [ - "darling", + "darling 0.20.10", "proc-macro2", "quote", "syn 2.0.106", @@ -4965,6 +6121,16 @@ dependencies = [ "unsafe-libyaml", ] +[[package]] +name = "serdect" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a84f14a19e9a014bb9f4512488d9829a68e04ecabffb0f9904cd1ace94598177" +dependencies = [ + "base16ct", + "serde", +] + [[package]] name = "sha-1" version = "0.9.8" @@ -5040,6 +6206,22 @@ dependencies = [ "libc", ] +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "digest 0.10.7", + "rand_core 0.6.4", +] + +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + [[package]] name = "siphasher" version = "0.3.11" @@ -5173,12 +6355,33 @@ version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +[[package]] +name = "spki" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +dependencies = [ + "base64ct", + "der", +] + [[package]] name = "sptr" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b9b39299b249ad65f3b7e96443bad61c02ca5cd3589f46cb6d610a0fd6c0d6a" +[[package]] +name = "sqlparser" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07c5f081b292a3d19637f0b32a79e28ff14a9fd23ef47bd7fce08ff5de221eca" +dependencies = [ + "log", + "recursive", + "sqlparser_derive", +] + [[package]] name = "sqlparser" version = "0.59.0" @@ -5417,6 +6620,18 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "syn-solidity" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff790eb176cc81bb8936aed0f7b9f14fc4670069a2d371b3e3b0ecce908b2cb3" +dependencies = [ + "paste", + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "sync_wrapper" version = "0.1.2" @@ -5591,6 +6806,15 @@ dependencies = [ "once_cell", ] +[[package]] +name = "threadpool" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d050e60b33d41c19108b32cea32164033a9013fe3b46cbd4457559bfbf77afaa" +dependencies = [ + "num_cpus", +] + [[package]] name = "time" version = "0.3.36" @@ -5740,7 +6964,7 @@ dependencies = [ "rand 0.9.2", "socket2 0.5.7", "tokio", - "tokio-util 0.7.11", + "tokio-util 0.7.17", "whoami", ] @@ -5775,7 +6999,7 @@ dependencies = [ "futures-core", "pin-project-lite", "tokio", - "tokio-util 0.7.11", + "tokio-util 0.7.17", ] [[package]] @@ -5820,9 +7044,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.11" +version = "0.7.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" +checksum = "2efa149fe76073d6e8fd97ef4f4eca7b67f599660115591483572e406e165594" dependencies = [ "bytes", "futures-core", @@ -5978,7 +7202,7 @@ dependencies = [ "rand 0.8.5", "slab", "tokio", - "tokio-util 0.7.11", + "tokio-util 0.7.17", "tower-layer 0.3.3 
(registry+https://github.com/rust-lang/crates.io-index)", "tower-service 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", "tracing", @@ -6013,7 +7237,7 @@ dependencies = [ "slab", "sync_wrapper 1.0.1", "tokio", - "tokio-util 0.7.11", + "tokio-util 0.7.17", "tower-layer 0.3.3 (git+https://github.com/tower-rs/tower.git)", "tower-service 0.3.3 (git+https://github.com/tower-rs/tower.git)", "tracing", @@ -6190,6 +7414,12 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + [[package]] name = "unicase" version = "2.7.0" @@ -6316,6 +7546,12 @@ version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0f540e3240398cce6128b64ba83fdbdd86129c16a3aa1a3a252efd66eb3d587" +[[package]] +name = "valuable" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + [[package]] name = "vcpkg" version = "0.2.15" @@ -6844,7 +8080,7 @@ dependencies = [ "pin-project", "reqwest", "rlp", - "secp256k1", + "secp256k1 0.21.3", "serde", "serde_json", "soketto", @@ -7344,6 +8580,26 @@ dependencies = [ "synstructure", ] +[[package]] +name = "zerocopy" +version = "0.8.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd74ec98b9250adb3ca554bdde269adf631549f51d8a8f8f0a10b50f1cb298c3" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a8d209fdf45cf5138cbb5a506f6b52522a25afccc534d1475dad8e31105c6a" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "zerofrom" version = "0.1.6" @@ -7370,6 +8626,20 @@ name = "zeroize" version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" +dependencies = [ + "zeroize_derive", +] + +[[package]] +name = "zeroize_derive" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] [[package]] name = "zerovec" diff --git a/Cargo.toml b/Cargo.toml index c7c25b817a5..7cd023fcccb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -97,6 +97,20 @@ substreams = "=0.6.0" substreams-entity-change = "2" substreams-near-core = "=0.10.2" rand = { version = "0.9.2", features = ["os_rng"] } +prometheus = "0.14.0" + +# Dependencies related to Amp subgraphs +ahash = "0.8.11" +alloy = { version = "1.0.12", default-features = false, features = ["json-abi", "serde"] } +arrow = { version = "=55.0.0" } +arrow-flight = { version = "=55.0.0", features = ["flight-sql-experimental"] } +futures = "0.3.31" +half = "2.7.1" +indoc = "2.0.7" +lazy-regex = "3.4.1" +parking_lot = "0.12.4" +sqlparser-latest = { version = "0.57.0", package = "sqlparser", features = ["visitor"] } +tokio-util = "0.7.15" # Incremental compilation on Rust 1.58 causes an ICE on build. As soon as graph node builds again, these can be removed. 
[profile.test] diff --git a/chain/ethereum/src/runtime/runtime_adapter.rs b/chain/ethereum/src/runtime/runtime_adapter.rs index 8b11ada37cc..acbf41c62a3 100644 --- a/chain/ethereum/src/runtime/runtime_adapter.rs +++ b/chain/ethereum/src/runtime/runtime_adapter.rs @@ -182,6 +182,7 @@ impl blockchain::RuntimeAdapter for RuntimeAdapter { create_host_fns(abis, archive, call_cache, eth_adapters, eth_call_gas) } data_source::DataSource::Offchain(_) => vec![], + data_source::DataSource::Amp(_) => vec![], }; Ok(host_fns) diff --git a/core/Cargo.toml b/core/Cargo.toml index 0a5440b2b30..28afc1079c6 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -19,6 +19,20 @@ thiserror = { workspace = true } cid = "0.11.1" anyhow = "1.0" +# Dependencies related to Amp subgraphs +alloy.workspace = true +arrow.workspace = true +chrono.workspace = true +futures.workspace = true +indoc.workspace = true +itertools.workspace = true +parking_lot.workspace = true +prometheus.workspace = true +slog.workspace = true +strum.workspace = true +tokio-util.workspace = true +tokio.workspace = true + [dev-dependencies] tower-test = { git = "https://github.com/tower-rs/tower.git" } wiremock = "0.6.5" diff --git a/core/src/amp_subgraph/manager.rs b/core/src/amp_subgraph/manager.rs new file mode 100644 index 00000000000..041a30df226 --- /dev/null +++ b/core/src/amp_subgraph/manager.rs @@ -0,0 +1,171 @@ +use std::sync::Arc; + +use alloy::primitives::BlockNumber; +use anyhow::Context; +use async_trait::async_trait; +use graph::{ + amp, + components::{ + link_resolver::{LinkResolver, LinkResolverContext}, + metrics::MetricsRegistry, + store::{DeploymentLocator, SubgraphStore}, + subgraph::SubgraphInstanceManager, + }, + env::EnvVars, + log::factory::LoggerFactory, + prelude::CheapClone, +}; +use slog::{debug, error}; +use tokio_util::sync::CancellationToken; + +use super::{runner, Metrics, Monitor}; + +/// Manages Amp subgraph runner futures. +/// +/// Creates and schedules Amp subgraph runner futures for execution on demand. +/// Also handles stopping previously started Amp subgraph runners. +pub struct Manager { + logger_factory: LoggerFactory, + metrics_registry: Arc, + env_vars: Arc, + monitor: Monitor, + subgraph_store: Arc, + link_resolver: Arc, + amp_client: Arc, +} + +impl Manager +where + SS: SubgraphStore, + NC: amp::Client, +{ + /// Creates a new Amp subgraph manager. 
+ pub fn new( + logger_factory: &LoggerFactory, + metrics_registry: Arc, + env_vars: Arc, + cancel_token: &CancellationToken, + subgraph_store: Arc, + link_resolver: Arc, + amp_client: Arc, + ) -> Self { + let logger = logger_factory.component_logger("AmpSubgraphManager", None); + let logger_factory = logger_factory.with_parent(logger); + + let monitor = Monitor::new(&logger_factory, cancel_token); + + Self { + logger_factory, + metrics_registry, + env_vars, + monitor, + subgraph_store, + link_resolver, + amp_client, + } + } +} + +#[async_trait] +impl SubgraphInstanceManager for Manager +where + SS: SubgraphStore, + NC: amp::Client + Send + Sync + 'static, +{ + async fn start_subgraph( + self: Arc, + deployment: DeploymentLocator, + stop_block: Option, + ) { + let manager = self.cheap_clone(); + + self.monitor.start( + deployment.cheap_clone(), + Box::new(move |cancel_token| { + Box::pin(async move { + let logger = manager.logger_factory.subgraph_logger(&deployment); + + let store = manager + .subgraph_store + .cheap_clone() + .writable(logger.cheap_clone(), deployment.id, Vec::new().into()) + .await + .context("failed to create writable store")?; + + let metrics = Metrics::new( + &logger, + manager.metrics_registry.cheap_clone(), + store.cheap_clone(), + deployment.hash.cheap_clone(), + ); + + let link_resolver = manager + .link_resolver + .for_manifest(&deployment.hash.to_string()) + .context("failed to create link resolver")?; + + let manifest_bytes = link_resolver + .cat( + &LinkResolverContext::new(&deployment.hash, &logger), + &deployment.hash.to_ipfs_link(), + ) + .await + .context("failed to load subgraph manifest")?; + + let raw_manifest = serde_yaml::from_slice(&manifest_bytes) + .context("failed to parse subgraph manifest")?; + + let mut manifest = amp::Manifest::resolve::( + &logger, + manager.link_resolver.cheap_clone(), + manager.amp_client.cheap_clone(), + manager.env_vars.max_spec_version.cheap_clone(), + deployment.hash.cheap_clone(), + raw_manifest, + ) + .await?; + + if let Some(stop_block) = stop_block { + for data_source in manifest.data_sources.iter_mut() { + data_source.source.end_block = stop_block as BlockNumber; + } + } + + store + .start_subgraph_deployment(&logger) + .await + .context("failed to start subgraph deployment")?; + + let runner_context = runner::Context::new( + &logger, + &manager.env_vars.amp, + manager.amp_client.cheap_clone(), + store, + deployment.hash.cheap_clone(), + manifest, + metrics, + ); + + let runner_result = runner::new_runner(runner_context)(cancel_token).await; + + match manager.subgraph_store.stop_subgraph(&deployment).await { + Ok(()) => { + debug!(logger, "Subgraph writer stopped"); + } + Err(e) => { + error!(logger, "Failed to stop subgraph writer"; + "e" => ?e + ); + } + } + + runner_result + }) + }), + ); + } + + async fn stop_subgraph(&self, deployment: DeploymentLocator) { + self.monitor.stop(deployment); + } +} diff --git a/core/src/amp_subgraph/metrics.rs b/core/src/amp_subgraph/metrics.rs new file mode 100644 index 00000000000..1e74a4bcb9a --- /dev/null +++ b/core/src/amp_subgraph/metrics.rs @@ -0,0 +1,260 @@ +use std::{sync::Arc, time::Duration}; + +use alloy::primitives::BlockNumber; +use graph::{ + cheap_clone::CheapClone, + components::{ + metrics::{stopwatch::StopwatchMetrics, MetricsRegistry}, + store::WritableStore, + }, + prelude::DeploymentHash, +}; +use indoc::indoc; +use prometheus::{IntCounter, IntGauge}; +use slog::Logger; + +/// Contains metrics specific to a deployment. 
+pub(super) struct Metrics { + pub(super) deployment_status: DeploymentStatus, + pub(super) deployment_head: DeploymentHead, + pub(super) deployment_target: DeploymentTarget, + pub(super) deployment_synced: DeploymentSynced, + pub(super) indexing_duration: IndexingDuration, + pub(super) blocks_processed: BlocksProcessed, + pub(super) stopwatch: StopwatchMetrics, +} + +impl Metrics { + /// Creates new deployment specific metrics. + pub(super) fn new( + logger: &Logger, + metrics_registry: Arc, + store: Arc, + deployment: DeploymentHash, + ) -> Self { + let stopwatch = StopwatchMetrics::new( + logger.cheap_clone(), + deployment.cheap_clone(), + "amp-process", + metrics_registry.cheap_clone(), + store.shard().to_string(), + ); + + let const_labels = [("deployment", &deployment)]; + + Self { + deployment_status: DeploymentStatus::new(&metrics_registry, const_labels.clone()), + deployment_head: DeploymentHead::new(&metrics_registry, const_labels.clone()), + deployment_target: DeploymentTarget::new(&metrics_registry, const_labels.clone()), + deployment_synced: DeploymentSynced::new(&metrics_registry, const_labels.clone()), + indexing_duration: IndexingDuration::new(&metrics_registry, const_labels.clone()), + blocks_processed: BlocksProcessed::new(&metrics_registry, const_labels.clone()), + stopwatch, + } + } +} + +/// Reports the current indexing status of a deployment. +pub(super) struct DeploymentStatus(IntGauge); + +impl DeploymentStatus { + const STATUS_STARTING: i64 = 1; + const STATUS_RUNNING: i64 = 2; + const STATUS_STOPPED: i64 = 3; + const STATUS_FAILED: i64 = 4; + + fn new( + metrics_registry: &MetricsRegistry, + const_labels: impl IntoIterator, + ) -> Self { + let int_gauge = metrics_registry + .new_int_gauge( + "amp_deployment_status", + indoc!( + " + Indicates the current indexing status of a deployment. + Possible values: + 1 - graph-node is preparing to start indexing; + 2 - deployment is being indexed; + 3 - indexing is stopped by request; + 4 - indexing failed; + " + ), + const_labels, + ) + .expect("failed to register `amp_deployment_status` gauge"); + + Self(int_gauge) + } + + /// Records that the graph-node is preparing to start indexing. + pub fn starting(&self) { + self.0.set(Self::STATUS_STARTING); + } + + /// Records that the deployment is being indexed. + pub fn running(&self) { + self.0.set(Self::STATUS_RUNNING); + } + + /// Records that the indexing stopped by request. + pub fn stopped(&self) { + self.0.set(Self::STATUS_STOPPED); + } + + /// Records that the indexing failed. + pub fn failed(&self) { + self.0.set(Self::STATUS_FAILED); + } +} + +/// Tracks the most recent block number processed by a deployment. +pub(super) struct DeploymentHead(IntGauge); + +impl DeploymentHead { + fn new( + metrics_registry: &MetricsRegistry, + const_labels: impl IntoIterator, + ) -> Self { + let int_gauge = metrics_registry + .new_int_gauge( + "amp_deployment_head", + "Tracks the most recent block number processed by a deployment", + const_labels, + ) + .expect("failed to register `amp_deployment_head` gauge"); + + Self(int_gauge) + } + + /// Updates the most recent block number processed by this deployment. + pub(super) fn update(&self, new_most_recent_block_number: BlockNumber) { + self.0.set( + i64::try_from(new_most_recent_block_number) + .expect("new most recent block number does not fit into `i64`"), + ); + } +} + +/// Tracks the target block number of a deployment. 
+pub(super) struct DeploymentTarget(IntGauge); + +impl DeploymentTarget { + fn new( + metrics_registry: &MetricsRegistry, + const_labels: impl IntoIterator, + ) -> Self { + let int_gauge = metrics_registry + .new_int_gauge( + "amp_deployment_target", + "Tracks the target block number of a deployment", + const_labels, + ) + .expect("failed to register `amp_deployment_target` gauge"); + + Self(int_gauge) + } + + /// Updates the target block number of this deployment. + pub(super) fn update(&self, new_target_block_number: BlockNumber) { + self.0.set( + i64::try_from(new_target_block_number) + .expect("new target block number does not fit into `i64`"), + ); + } +} + +/// Indicates whether a deployment has reached the chain head or the end block since it was deployed. +pub(super) struct DeploymentSynced(IntGauge); + +impl DeploymentSynced { + const NOT_SYNCED: i64 = 0; + const SYNCED: i64 = 1; + + pub fn new( + metrics_registry: &MetricsRegistry, + const_labels: impl IntoIterator, + ) -> Self { + let int_gauge = metrics_registry + .new_int_gauge( + "amp_deployment_synced", + indoc!( + " + Indicates whether a deployment has reached the chain head or the end block since it was deployed. + Possible values: + 0 - deployment is not synced; + 1 - deployment is synced; + " + ), + const_labels, + ) + .expect("failed to register `amp_deployment_synced` gauge"); + + Self(int_gauge) + } + + /// Records the current sync status of this deployment. + pub fn record(&self, synced: bool) { + self.0.set(if synced { + Self::SYNCED + } else { + Self::NOT_SYNCED + }); + } +} + +/// Tracks the total duration in seconds of deployment indexing. +#[derive(Clone)] +pub(super) struct IndexingDuration(IntCounter); + +impl IndexingDuration { + fn new( + metrics_registry: &MetricsRegistry, + const_labels: impl IntoIterator, + ) -> Self { + let int_counter = metrics_registry + .new_int_counter( + "amp_deployment_indexing_duration_seconds", + "Tracks the total duration in seconds of deployment indexing", + const_labels, + ) + .expect("failed to register `amp_deployment_indexing_duration_seconds` counter"); + + Self(int_counter) + } + + /// Records a new indexing duration of this deployment. + pub(super) fn record(&self, duration: Duration) { + self.0.inc_by(duration.as_secs()) + } +} + +/// Tracks the total number of blocks processed by a deployment. +pub(super) struct BlocksProcessed(IntCounter); + +impl BlocksProcessed { + fn new( + metrics_registry: &MetricsRegistry, + const_labels: impl IntoIterator, + ) -> Self { + let int_counter = metrics_registry + .new_int_counter( + "amp_deployment_blocks_processed_count", + "Tracks the total number of blocks processed by a deployment", + const_labels, + ) + .expect("failed to register `amp_deployment_blocks_processed_count` counter"); + + Self(int_counter) + } + + /// Records a new processed block. + pub(super) fn record_one(&self) { + self.record(1); + } + + /// Records the new processed blocks. 
+ pub(super) fn record(&self, number_of_blocks_processed: usize) { + self.0.inc_by(number_of_blocks_processed as u64); + } +} diff --git a/core/src/amp_subgraph/mod.rs b/core/src/amp_subgraph/mod.rs new file mode 100644 index 00000000000..3d3846742aa --- /dev/null +++ b/core/src/amp_subgraph/mod.rs @@ -0,0 +1,8 @@ +mod manager; +mod metrics; +mod monitor; +mod runner; + +use self::{metrics::Metrics, monitor::Monitor}; + +pub use self::manager::Manager; diff --git a/core/src/amp_subgraph/monitor.rs b/core/src/amp_subgraph/monitor.rs new file mode 100644 index 00000000000..cfa1de2942d --- /dev/null +++ b/core/src/amp_subgraph/monitor.rs @@ -0,0 +1,573 @@ +//! This module is responsible for executing subgraph runner futures. +//! +//! # Terminology used in this module +//! +//! `active subgraph` - A subgraph that was started and is still tracked. +//! `running subgraph` - A subgraph that has an instance that is making progress or stopping. +//! `subgraph instance` - A background process that executes the subgraph runner future. + +use std::{ + collections::{hash_map::Entry, HashMap}, + fmt, + sync::{ + atomic::{AtomicU32, Ordering::SeqCst}, + Arc, + }, + time::Duration, +}; + +use anyhow::Result; +use futures::future::BoxFuture; +use graph::{ + cheap_clone::CheapClone, components::store::DeploymentLocator, log::factory::LoggerFactory, +}; +use slog::{debug, error, info, warn, Logger}; +use tokio::{sync::mpsc, task::JoinHandle, time::timeout}; +use tokio_util::sync::CancellationToken; + +/// Represents the maximum amount of time a subgraph instance is allowed to run +/// after it receives a cancel signal. +/// +/// If a subgraph instance does not complete its execution in this amount of time +/// it is considered unresponsive and is aborted. +const SUBGRAPH_INSTANCE_GRACE_PERIOD: Duration = { + if cfg!(test) { + Duration::from_millis(300) + } else if cfg!(debug_assertions) { + Duration::from_secs(30) + } else { + Duration::from_secs(300) + } +}; + +/// Represents the subgraph runner future. +/// +/// This is the future that performs the subgraph indexing. +/// It is expected to return only on deterministic failures or when indexing is completed. +/// All retry functionality must be handled internally by this future. +pub(super) type BoxRunner = + Box BoxFuture<'static, Result<()>> + Send + 'static>; + +/// Manages the lifecycle of subgraph runners. +/// +/// Ensures that there is at most one subgraph instance running +/// for any subgraph deployment at any point in time. +/// Handles starting, stopping and restarting subgraphs. +pub(super) struct Monitor { + logger_factory: Arc, + + /// Every subgraph instance is assigned a cancel token derived from this token. + /// + /// This means that the `Monitor` can send cancel signals to all subgraph instances at once, + /// and to each subgraph instance individually. + cancel_token: CancellationToken, + + /// The channel that is used to send subgraph commands. + /// + /// Every subgraph start and stop request results in a command that is sent to the + /// background process that manages the subgraph instances. + command_tx: mpsc::UnboundedSender, + + /// When a subgraph starts it is assigned a sequential ID. + /// The ID is then kept in memory in the list of active subgraphs. + /// + /// When the subgraph completes execution it should be removed from the + /// list of active subgraphs, so that it can be restarted. + /// + /// This ID is required to be able to check if the active subgraph + /// is the same subgraph instance that was stopped. 
+ /// + /// If the IDs do not match, it means that the subgraph was force restarted, + /// ignoring the state of the previous subgraph instance, or that the subgraph + /// was restarted after the previous subgraph instance completed its execution + /// but before the remove request was processed. + subgraph_instance_id: Arc, +} + +impl Monitor { + /// Creates a new subgraph monitor. + /// + /// Spawns a background process that manages the subgraph start and stop requests. + /// + /// A new cancel token is derived from the `cancel_token` and only the derived token is used by the + /// subgraph monitor and its background process. + pub(super) fn new(logger_factory: &LoggerFactory, cancel_token: &CancellationToken) -> Self { + let logger = logger_factory.component_logger("AmpSubgraphMonitor", None); + let logger_factory = Arc::new(logger_factory.with_parent(logger)); + + // A derived token makes sure it is not possible to accidentally cancel the parent token + let cancel_token = cancel_token.child_token(); + + // It is safe to use an unbounded channel here, because it's pretty much unrealistic that the + // command processor will fall behind so much that the channel buffer will take up all the memory. + // The command processor is non-blocking and delegates long-running processes to detached tasks. + let (command_tx, command_rx) = mpsc::unbounded_channel::(); + + tokio::spawn(Self::command_processor( + logger_factory.cheap_clone(), + cancel_token.cheap_clone(), + command_tx.clone(), + command_rx, + )); + + Self { + logger_factory, + cancel_token, + command_tx, + subgraph_instance_id: Arc::new(AtomicU32::new(0)), + } + } + + /// Starts a subgraph. + /// + /// Sends a subgraph start request to this subgraph monitor that + /// eventually starts the subgraph. + /// + /// # Behaviour + /// + /// - If the subgraph is not active, it starts when the request is processed + /// - If the subgraph is active, it stops, and then restarts + /// - Ensures that there is only one subgraph instance for this subgraph deployment + /// - Multiple consecutive calls in a short time period force restart the subgraph, + /// aborting the active subgraph instance + pub(super) fn start(&self, deployment: DeploymentLocator, runner: BoxRunner) { + let logger = self + .logger_factory + .subgraph_logger(&deployment) + .new(slog::o!("method" => "start")); + + info!(logger, "Starting subgraph"); + handle_send_result( + &logger, + self.command_tx.send(Command::Start { + id: self.subgraph_instance_id.fetch_add(1, SeqCst), + deployment, + runner, + }), + ); + } + + /// Stops the subgraph. + /// + /// Sends a subgraph stop request to this subgraph monitor that + /// eventually stops the subgraph. + /// + /// # Behaviour + /// + /// - If the subgraph is not active does nothing + /// - If the subgraph is active, sends a cancel signal that gracefully stops the subgraph + /// - If the subgraph fails to stop after an extended period of time it aborts + pub(super) fn stop(&self, deployment: DeploymentLocator) { + let logger = self + .logger_factory + .subgraph_logger(&deployment) + .new(slog::o!("method" => "stop")); + + info!(logger, "Stopping subgraph"); + handle_send_result(&logger, self.command_tx.send(Command::Stop { deployment })); + } + + /// Processes commands sent through the command channel. + /// + /// Tracks active subgraphs and keeps a list of pending start commands. + /// Pending start commands are start commands that execute after the related subgraph stops. 
+ async fn command_processor( + logger_factory: Arc, + cancel_token: CancellationToken, + command_tx: mpsc::UnboundedSender, + mut command_rx: mpsc::UnboundedReceiver, + ) { + let logger = logger_factory.component_logger("CommandProcessor", None); + let mut subgraph_instances: HashMap = HashMap::new(); + let mut pending_start_commands: HashMap = HashMap::new(); + + loop { + tokio::select! { + Some(command) = command_rx.recv() => { + debug!(logger, "Processing a new command"; + "command" => ?command + ); + + match &command { + Command::Start { .. } => { + Self::process_start_command( + &logger_factory, + &cancel_token, + &mut subgraph_instances, + &mut pending_start_commands, + &command_tx, + command + ); + }, + Command::Stop { .. } => { + Self::process_stop_command( + &logger_factory, + &mut subgraph_instances, + &mut pending_start_commands, + command + ); + }, + Command::Clear { .. } => { + Self::process_clear_command( + &logger_factory, + &mut subgraph_instances, + &mut pending_start_commands, + &command_tx, + command + ); + }, + } + }, + _ = cancel_token.cancelled() => { + debug!(logger, "Stopping command processor"); + + // All active subgraphs will shutdown gracefully + // because their cancel tokens are derived from this cancelled token. + return; + } + } + } + } + + /// Starts a subgraph. + /// + /// # Behaviour + /// + /// - If the subgraph is not active, it starts right away + /// - If the subgraph is active, a cancel signal is sent to the active subgraph instance + /// and this start request is stored in the list of pending start commands + /// - If the subgraph is active and there is already a pending start command, + /// the active subgraph instance aborts, and the subgraph force restarts right away + /// - If the subgraph is active, but its instance is not actually running, + /// the subgraph starts right away + fn process_start_command( + logger_factory: &LoggerFactory, + cancel_token: &CancellationToken, + subgraph_instances: &mut HashMap, + pending_start_commands: &mut HashMap, + command_tx: &mpsc::UnboundedSender, + command: Command, + ) { + let Command::Start { + id, + deployment, + runner, + } = command + else { + unreachable!(); + }; + + let logger = logger_factory.subgraph_logger(&deployment); + let command_logger = logger.new(slog::o!("command" => "start")); + + let cancel_token = cancel_token.child_token(); + let pending_start_command = pending_start_commands.remove(&deployment); + + match subgraph_instances.entry(deployment.cheap_clone()) { + Entry::Vacant(entry) => { + debug!(command_logger, "Subgraph is not active, starting"); + + let subgraph_instance = Self::start_subgraph( + logger, + cancel_token, + id, + deployment, + runner, + command_tx.clone(), + ); + + entry.insert(subgraph_instance); + } + Entry::Occupied(mut entry) => { + let subgraph_instance = entry.get_mut(); + subgraph_instance.cancel_token.cancel(); + + if pending_start_command.is_some() { + debug!(command_logger, "Subgraph is active, force restarting"); + + subgraph_instance.handle.abort(); + + *subgraph_instance = Self::start_subgraph( + logger, + cancel_token, + id, + deployment, + runner, + command_tx.clone(), + ); + + return; + } + + if subgraph_instance.handle.is_finished() { + debug!(command_logger, "Subgraph is not running, starting"); + + *subgraph_instance = Self::start_subgraph( + logger, + cancel_token, + id, + deployment, + runner, + command_tx.clone(), + ); + + return; + } + + debug!(command_logger, "Gracefully restarting subgraph"); + + pending_start_commands.insert( + 
deployment.cheap_clone(), + Command::Start { + id, + deployment, + runner, + }, + ); + } + } + } + + /// Stops a subgraph. + /// + /// # Behaviour + /// + /// - If the subgraph is not active, does nothing + /// - If the subgraph is active, sends a cancel signal to the active subgraph instance + fn process_stop_command( + logger_factory: &LoggerFactory, + subgraph_instances: &mut HashMap, + pending_start_commands: &mut HashMap, + command: Command, + ) { + let Command::Stop { deployment } = command else { + unreachable!(); + }; + + let logger = logger_factory + .subgraph_logger(&deployment) + .new(slog::o!("command" => "stop")); + + if let Some(subgraph_instance) = subgraph_instances.get(&deployment) { + debug!(logger, "Sending cancel signal"); + subgraph_instance.cancel_token.cancel(); + } else { + debug!(logger, "Subgraph is not active"); + } + + pending_start_commands.remove(&deployment); + } + + /// Removes a subgraph from the list of active subgraphs allowing the subgraph to be restarted. + fn process_clear_command( + logger_factory: &LoggerFactory, + subgraph_instances: &mut HashMap, + pending_start_commands: &mut HashMap, + command_tx: &mpsc::UnboundedSender, + command: Command, + ) { + let Command::Clear { id, deployment } = command else { + unreachable!(); + }; + + let logger = logger_factory + .subgraph_logger(&deployment) + .new(slog::o!("command" => "clear")); + + match subgraph_instances.get(&deployment) { + Some(subgraph_instance) if subgraph_instance.id == id => { + debug!(logger, "Removing active subgraph"); + subgraph_instances.remove(&deployment); + } + Some(_subgraph_instance) => { + debug!(logger, "Active subgraph does not need to be removed"); + return; + } + None => { + debug!(logger, "Subgraph is not active"); + } + } + + if let Some(pending_start_command) = pending_start_commands.remove(&deployment) { + debug!(logger, "Resending a pending start command"); + handle_send_result(&logger, command_tx.send(pending_start_command)); + } + } + + /// Spawns a background process that executes the subgraph runner future. + /// + /// An additional background process is spawned to handle the graceful shutdown of the subgraph runner, + /// and to ensure correct behaviour even if the subgraph runner panics. + fn start_subgraph( + logger: Logger, + cancel_token: CancellationToken, + id: u32, + deployment: DeploymentLocator, + runner: BoxRunner, + command_tx: mpsc::UnboundedSender, + ) -> SubgraphInstance { + let mut runner_handle = tokio::spawn({ + let logger = logger.new(slog::o!("process" => "subgraph_runner")); + let cancel_token = cancel_token.cheap_clone(); + + async move { + info!(logger, "Subgraph started"); + + match runner(cancel_token).await { + Ok(()) => { + info!(logger, "Subgraph stopped"); + } + Err(e) => { + error!(logger, "Subgraph failed"; + "error" => ?e + ); + } + } + } + }); + + let supervisor_handle = tokio::spawn({ + let logger = logger.new(slog::o!("process" => "subgraph_supervisor")); + let cancel_token = cancel_token.cheap_clone(); + + fn handle_runner_result(logger: &Logger, result: Result<(), tokio::task::JoinError>) { + match result { + Ok(()) => { + debug!(logger, "Subgraph completed execution"); + } + Err(e) if e.is_panic() => { + error!(logger, "Subgraph panicked"; + "error" => ?e + ); + + // TODO: Maybe abort the entire process on panic and require a full graph-node restart. + // Q: Should a bug that is triggered in a specific subgraph affect everything? + // Q: How to make this failure loud enough so it is not missed? 
+ // + // println!("Subgraph panicked"); + // std::process::abort(); + } + Err(e) => { + error!(logger, "Subgraph failed"; + "error" => ?e + ); + } + } + } + + async move { + debug!(logger, "Subgraph supervisor started"); + + tokio::select! { + _ = cancel_token.cancelled() => { + debug!(logger, "Received cancel signal, waiting for subgraph to stop"); + + match timeout(SUBGRAPH_INSTANCE_GRACE_PERIOD, &mut runner_handle).await { + Ok(result) => { + handle_runner_result(&logger, result); + }, + Err(_) => { + warn!(logger, "Subgraph did not stop after grace period, aborting"); + + runner_handle.abort(); + let _ = runner_handle.await; + + warn!(logger, "Subgraph aborted"); + } + } + }, + result = &mut runner_handle => { + handle_runner_result(&logger, result); + cancel_token.cancel(); + } + } + + debug!(logger, "Sending clear command"); + handle_send_result(&logger, command_tx.send(Command::Clear { id, deployment })); + } + }); + + SubgraphInstance { + id, + handle: supervisor_handle, + cancel_token, + } + } +} + +impl Drop for Monitor { + fn drop(&mut self) { + // Send cancel signals to all active subgraphs so that they don't remain without an associated monitor + self.cancel_token.cancel(); + } +} + +/// Represents a background process that executes the subgraph runner future. +struct SubgraphInstance { + id: u32, + handle: JoinHandle<()>, + cancel_token: CancellationToken, +} + +/// Every command used by the subgraph monitor. +enum Command { + /// A request to start executing the subgraph runner future. + Start { + id: u32, + deployment: DeploymentLocator, + runner: BoxRunner, + }, + + /// A request to stop executing the subgraph runner future. + Stop { deployment: DeploymentLocator }, + + /// A request to remove the subgraph from the list of active subgraphs. 
+ Clear { + id: u32, + deployment: DeploymentLocator, + }, +} + +impl fmt::Debug for Command { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Start { + id, + deployment, + runner: _, + } => f + .debug_struct("Start") + .field("id", id) + .field("deployment", deployment) + .finish_non_exhaustive(), + Self::Stop { deployment } => f + .debug_struct("Stop") + .field("deployment", deployment) + .finish(), + Self::Clear { id, deployment } => f + .debug_struct("Clear") + .field("id", id) + .field("deployment", deployment) + .finish(), + } + } +} + +fn handle_send_result( + logger: &Logger, + result: Result<(), tokio::sync::mpsc::error::SendError>, +) { + match result { + Ok(()) => { + debug!(logger, "Command was sent successfully"); + } + + // This should only happen if the parent cancel token of the subgraph monitor was cancelled + Err(e) => { + error!(logger, "Failed to send command"; + "command" => ?e.0, + "error" => ?e + ); + } + } +} diff --git a/core/src/amp_subgraph/runner/compat.rs b/core/src/amp_subgraph/runner/compat.rs new file mode 100644 index 00000000000..c695238416c --- /dev/null +++ b/core/src/amp_subgraph/runner/compat.rs @@ -0,0 +1,50 @@ +use alloy::primitives::{BlockHash, BlockNumber}; +use chrono::{DateTime, Utc}; + +mod legacy { + pub(super) use graph::{ + blockchain::{BlockHash, BlockPtr, BlockTime}, + components::store::BlockNumber, + data::store::scalar::Timestamp, + }; +} + +pub(super) trait Compat { + fn compat(&self) -> T; +} + +impl Compat for BlockNumber { + fn compat(&self) -> legacy::BlockNumber { + (*self).try_into().unwrap() + } +} + +impl Compat for legacy::BlockNumber { + fn compat(&self) -> BlockNumber { + (*self).try_into().unwrap() + } +} + +impl Compat for BlockHash { + fn compat(&self) -> legacy::BlockHash { + legacy::BlockHash(self.0.into()) + } +} + +impl Compat for legacy::BlockHash { + fn compat(&self) -> BlockHash { + BlockHash::from_slice(&self.0) + } +} + +impl Compat for DateTime { + fn compat(&self) -> legacy::BlockTime { + legacy::Timestamp(*self).into() + } +} + +impl Compat for (BlockNumber, BlockHash) { + fn compat(&self) -> legacy::BlockPtr { + legacy::BlockPtr::new(self.1.compat(), self.0.compat()) + } +} diff --git a/core/src/amp_subgraph/runner/context.rs b/core/src/amp_subgraph/runner/context.rs new file mode 100644 index 00000000000..32e96148acf --- /dev/null +++ b/core/src/amp_subgraph/runner/context.rs @@ -0,0 +1,105 @@ +use std::sync::Arc; + +use alloy::primitives::{BlockHash, BlockNumber}; +use graph::{ + amp::{log::Logger as _, Codec, Manifest}, + cheap_clone::CheapClone, + components::store::WritableStore, + data::subgraph::DeploymentHash, + env::AmpEnv, + util::backoff::ExponentialBackoff, +}; +use slog::Logger; + +use super::Compat; +use crate::amp_subgraph::Metrics; + +pub(in super::super) struct Context { + pub(super) logger: Logger, + pub(super) client: Arc, + pub(super) store: Arc, + pub(super) max_buffer_size: usize, + pub(super) max_block_range: usize, + pub(super) backoff: ExponentialBackoff, + pub(super) deployment: DeploymentHash, + pub(super) manifest: Manifest, + pub(super) metrics: Metrics, + pub(super) codec: Codec, +} + +impl Context { + pub(in super::super) fn new( + logger: &Logger, + env: &AmpEnv, + client: Arc, + store: Arc, + deployment: DeploymentHash, + manifest: Manifest, + metrics: Metrics, + ) -> Self { + let logger = logger.component("AmpSubgraphRunner"); + let backoff = ExponentialBackoff::new(env.query_retry_min_delay, env.query_retry_max_delay); + let codec = 
Codec::new(manifest.schema.cheap_clone()); + + Self { + logger, + client, + store, + max_buffer_size: env.max_buffer_size, + max_block_range: env.max_block_range, + backoff, + deployment, + manifest, + metrics, + codec, + } + } + + pub(super) fn indexing_completed(&self) -> bool { + let Some(last_synced_block) = self.latest_synced_block() else { + return false; + }; + + self.manifest + .data_sources + .iter() + .all(|data_source| last_synced_block >= data_source.source.end_block) + } + + pub(super) fn latest_synced_block(&self) -> Option { + self.latest_synced_block_ptr() + .map(|(block_number, _)| block_number) + } + + pub(super) fn latest_synced_block_ptr(&self) -> Option<(BlockNumber, BlockHash)> { + self.store + .block_ptr() + .map(|block_ptr| (block_ptr.number.compat(), block_ptr.hash.compat())) + } + + pub(super) fn total_queries(&self) -> usize { + self.manifest + .data_sources + .iter() + .map(|data_source| data_source.transformer.tables.len()) + .sum() + } + + pub(super) fn min_start_block(&self) -> BlockNumber { + self.manifest + .data_sources + .iter() + .map(|data_source| data_source.source.start_block) + .min() + .unwrap() + } + + pub(super) fn max_end_block(&self) -> BlockNumber { + self.manifest + .data_sources + .iter() + .map(|data_source| data_source.source.end_block) + .max() + .unwrap() + } +} diff --git a/core/src/amp_subgraph/runner/data_processing.rs b/core/src/amp_subgraph/runner/data_processing.rs new file mode 100644 index 00000000000..2801abc66aa --- /dev/null +++ b/core/src/amp_subgraph/runner/data_processing.rs @@ -0,0 +1,252 @@ +use std::sync::Arc; + +use alloy::primitives::{BlockHash, BlockNumber}; +use anyhow::anyhow; +use arrow::array::RecordBatch; +use chrono::{DateTime, Utc}; +use graph::{ + amp::{ + codec::{utils::auto_block_timestamp_decoder, DecodeOutput, DecodedEntity, Decoder}, + stream_aggregator::{RecordBatchGroup, RecordBatchGroups, StreamRecordBatch}, + }, + blockchain::block_stream::FirehoseCursor, + cheap_clone::CheapClone, + components::store::{EntityCache, ModificationsAndCache}, +}; +use slog::{debug, trace}; + +use super::{data_stream::TablePtr, Compat, Context, Error}; + +pub(super) async fn process_record_batch_groups( + cx: &mut Context, + mut entity_cache: EntityCache, + record_batch_groups: RecordBatchGroups, + stream_table_ptr: Arc<[TablePtr]>, + latest_block: BlockNumber, +) -> Result { + if record_batch_groups.is_empty() { + debug!(cx.logger, "Received no record batch groups"); + return Ok(entity_cache); + } + + let from_block = record_batch_groups + .first_key_value() + .map(|((block, _), _)| *block) + .unwrap(); + + let to_block = record_batch_groups + .last_key_value() + .map(|((block, _), _)| *block) + .unwrap(); + + debug!(cx.logger, "Processing record batch groups"; + "from_block" => from_block, + "to_block" => to_block + ); + + for ((block_number, block_hash), record_batch_group) in record_batch_groups { + trace!(cx.logger, "Processing record batch group"; + "block" => block_number, + "record_batches_count" => record_batch_group.record_batches.len() + ); + + entity_cache = process_record_batch_group( + cx, + entity_cache, + block_number, + block_hash, + record_batch_group, + &stream_table_ptr, + latest_block, + ) + .await + .map_err(|e| { + e.context(format!( + "failed to process record batch group at block '{block_number}'" + )) + })?; + + cx.metrics.deployment_head.update(block_number); + cx.metrics.blocks_processed.record_one(); + + trace!(cx.logger, "Completed processing record batch group"; + "block" => block_number + ); 
+ } + + debug!(cx.logger, "Completed processing record batch groups"; + "from_block" => from_block, + "to_block" => to_block + ); + + Ok(entity_cache) +} + +async fn process_record_batch_group( + cx: &mut Context, + mut entity_cache: EntityCache, + block_number: BlockNumber, + block_hash: BlockHash, + record_batch_group: RecordBatchGroup, + stream_table_ptr: &[TablePtr], + latest_block: BlockNumber, +) -> Result { + let _section = cx + .metrics + .stopwatch + .start_section("process_record_batch_group"); + + let RecordBatchGroup { record_batches } = record_batch_group; + + if record_batches.is_empty() { + debug!(cx.logger, "Record batch group is empty"); + return Ok(entity_cache); + } + + let block_timestamp = decode_block_timestamp(&record_batches) + .map_err(|e| e.context("failed to decode block timestamp"))?; + + for record_batch in record_batches { + let StreamRecordBatch { + stream_index, + record_batch, + } = record_batch; + + process_record_batch( + cx, + &mut entity_cache, + block_number, + record_batch, + stream_table_ptr[stream_index], + ) + .await + .map_err(|e| { + e.context(format!( + "failed to process record batch for stream '{stream_index}'" + )) + })?; + } + + let ModificationsAndCache { + modifications, + entity_lfu_cache, + evict_stats: _, + } = entity_cache + .as_modifications(block_number.compat()) + .map_err(Error::from) + .map_err(|e| e.context("failed to extract entity modifications from the state"))?; + + let is_close_to_chain_head = latest_block.saturating_sub(block_number) <= 100; + + cx.store + .transact_block_operations( + (block_number, block_hash).compat(), + block_timestamp.compat(), + FirehoseCursor::None, + modifications, + &cx.metrics.stopwatch, + Vec::new(), + Vec::new(), + Vec::new(), + false, + is_close_to_chain_head, + ) + .await + .map_err(Error::from) + .map_err(|e| e.context("failed to transact block operations"))?; + + if is_close_to_chain_head { + cx.metrics.deployment_synced.record(true); + } + + Ok(EntityCache::with_current( + cx.store.cheap_clone(), + entity_lfu_cache, + )) +} + +async fn process_record_batch( + cx: &mut Context, + entity_cache: &mut EntityCache, + block_number: BlockNumber, + record_batch: RecordBatch, + (i, j): TablePtr, +) -> Result<(), Error> { + let _section = cx.metrics.stopwatch.start_section("process_record_batch"); + + let table = &cx.manifest.data_sources[i].transformer.tables[j]; + let entity_name = &table.name; + + let DecodeOutput { + entity_type, + id_type, + decoded_entities, + } = cx + .codec + .decode(record_batch, entity_name.as_str()) + .map_err(|e| { + Error::Deterministic( + e.context(format!("failed to decode entities of type '{entity_name}'")), + ) + })?; + + for decoded_entity in decoded_entities { + let DecodedEntity { + key, + mut entity_data, + } = decoded_entity; + + let key = match key { + Some(key) => key, + None => { + let entity_id = entity_cache + .generate_id(id_type, block_number.compat()) + .map_err(|e| { + Error::Deterministic(e.context(format!( + "failed to generate a new id for an entity of type '{entity_name}'" + ))) + })?; + + entity_data.push(("id".into(), entity_id.clone().into())); + entity_type.key(entity_id) + } + }; + + let entity_id = key.entity_id.clone(); + let entity = cx.manifest.schema.make_entity(entity_data).map_err(|e| { + Error::Deterministic(anyhow!(e).context(format!( + "failed to create a new entity of type '{entity_name}' with id '{entity_id}'" + ))) + })?; + + entity_cache + .set(key, entity, block_number.compat(), None) + .map_err(|e| { + 
Error::Deterministic(e.context(format!( + "failed to store a new entity of type '{entity_name}' with id '{entity_id}'" + ))) + })?; + } + + Ok(()) +} + +fn decode_block_timestamp(record_batches: &[StreamRecordBatch]) -> Result, Error> { + let mut last_error: Option = None; + + for record_batch in record_batches { + match auto_block_timestamp_decoder(&record_batch.record_batch) { + Ok((_, decoder)) => { + return decoder + .decode(0) + .map_err(|e| Error::Deterministic(e))? + .ok_or_else(|| Error::Deterministic(anyhow!("block timestamp is empty"))); + } + Err(e) => { + last_error = Some(Error::Deterministic(e)); + } + } + } + + Err(last_error.unwrap()) +} diff --git a/core/src/amp_subgraph/runner/data_stream.rs b/core/src/amp_subgraph/runner/data_stream.rs new file mode 100644 index 00000000000..7f3636a5af9 --- /dev/null +++ b/core/src/amp_subgraph/runner/data_stream.rs @@ -0,0 +1,206 @@ +use std::{collections::HashMap, ops::RangeInclusive, sync::Arc}; + +use alloy::primitives::BlockNumber; +use anyhow::anyhow; +use futures::{ + stream::{empty, BoxStream}, + StreamExt, TryStreamExt, +}; +use graph::{ + amp::{ + manifest::DataSource, + stream_aggregator::{RecordBatchGroups, StreamAggregator}, + Client, + }, + cheap_clone::CheapClone, +}; +use slog::{debug, warn}; + +use super::{Context, Error}; + +pub(super) type TablePtr = (usize, usize); + +pub(super) fn new_data_stream( + cx: &Context, + latest_block: BlockNumber, +) -> BoxStream<'static, Result<(RecordBatchGroups, Arc<[TablePtr]>), Error>> +where + AC: Client, +{ + let logger = cx.logger.new(slog::o!("process" => "new_data_stream")); + + let total_queries = cx.total_queries(); + let mut total_queries_to_execute = 0; + let mut data_streams = Vec::new(); + let mut latest_queried_block = cx.latest_synced_block(); + let mut max_end_block = BlockNumber::MIN; + + debug!(logger, "Creating data stream"; + "from_block" => latest_queried_block.unwrap_or(BlockNumber::MIN), + "to_block" => latest_block, + "min_start_block" => cx.min_start_block(), + "max_block_range" => cx.max_block_range, + ); + + loop { + let next_block_ranges = next_block_ranges(&cx, latest_queried_block, latest_block); + + if next_block_ranges.is_empty() { + if data_streams.is_empty() { + warn!(logger, "There are no unprocessed block ranges"); + } + break; + } + + let mut query_streams = Vec::with_capacity(total_queries); + let mut query_streams_table_ptr = Vec::with_capacity(total_queries); + let mut min_start_block = BlockNumber::MAX; + + for (i, data_source) in cx.manifest.data_sources.iter().enumerate() { + let Some(block_range) = next_block_ranges.get(&i) else { + continue; + }; + + if *block_range.start() < min_start_block { + min_start_block = *block_range.start(); + } + + if *block_range.end() > max_end_block { + max_end_block = *block_range.end(); + } + + for (j, table) in data_source.transformer.tables.iter().enumerate() { + let query = table.query.build_with_block_range(block_range); + let stream = cx.client.query(&cx.logger, query, None); + let stream_name = format!("{}.{}", data_source.name, table.name); + + query_streams.push((stream_name, stream)); + query_streams_table_ptr.push((i, j)); + } + } + + let query_streams_table_ptr: Arc<[TablePtr]> = query_streams_table_ptr.into(); + total_queries_to_execute += query_streams.len(); + + let mut min_start_block_checked = false; + let mut load_first_record_batch_group_section = Some( + cx.metrics + .stopwatch + .start_section("load_first_record_batch_group"), + ); + + data_streams.push( + StreamAggregator::new(&cx.logger, 
query_streams, cx.max_buffer_size) + .map_ok(move |response| (response, query_streams_table_ptr.cheap_clone())) + .map_err(Error::from) + .map(move |result| { + if load_first_record_batch_group_section.is_some() { + let _section = load_first_record_batch_group_section.take(); + } + + match result { + Ok(response) => { + if !min_start_block_checked { + if let Some(((first_block, _), _)) = response.0.first_key_value() { + if *first_block < min_start_block { + return Err(Error::NonDeterministic(anyhow!( + "chain reorg" + ))); + } + } + + min_start_block_checked = true; + } + + Ok(response) + } + Err(e) => Err(e), + } + }) + .boxed(), + ); + + if max_end_block >= latest_block { + break; + } + + latest_queried_block = Some(max_end_block); + } + + debug!(logger, "Created aggregated data streams"; + "total_data_streams" => data_streams.len(), + "total_queries_to_execute" => total_queries_to_execute + ); + + let mut iter = data_streams.into_iter(); + let mut merged_data_stream = iter.next().unwrap_or_else(|| empty().boxed()); + + for data_stream in iter { + merged_data_stream = merged_data_stream.chain(data_stream).boxed(); + } + + merged_data_stream +} + +fn next_block_ranges( + cx: &Context, + latest_queried_block: Option, + latest_block: BlockNumber, +) -> HashMap> { + let block_ranges = cx + .manifest + .data_sources + .iter() + .enumerate() + .filter_map(|(i, data_source)| { + next_block_range(cx, data_source, latest_queried_block, latest_block) + .map(|block_range| (i, block_range)) + }) + .collect::>(); + + let Some(min_block_range) = block_ranges + .iter() + .min_by_key(|(_, block_range)| *block_range.start()) + .map(|(_, min_block_range)| min_block_range.clone()) + else { + return HashMap::new(); + }; + + block_ranges + .into_iter() + .filter(|(_, block_range)| block_range.start() <= min_block_range.end()) + .collect() +} + +fn next_block_range( + cx: &Context, + data_source: &DataSource, + latest_queried_block: Option, + latest_block: BlockNumber, +) -> Option> { + let start_block = match latest_queried_block { + Some(latest_queried_block) => { + if latest_queried_block >= data_source.source.end_block { + return None; + } + + latest_queried_block + 1 + } + None => data_source.source.start_block, + }; + + let end_block = [ + start_block.saturating_add(cx.max_block_range as BlockNumber), + data_source.source.end_block, + latest_block, + ] + .into_iter() + .min() + .unwrap(); + + if start_block > end_block { + return None; + } + + Some(start_block..=end_block) +} diff --git a/core/src/amp_subgraph/runner/error.rs b/core/src/amp_subgraph/runner/error.rs new file mode 100644 index 00000000000..8c7077e1c68 --- /dev/null +++ b/core/src/amp_subgraph/runner/error.rs @@ -0,0 +1,43 @@ +use graph::amp::error::IsDeterministic; +use thiserror::Error; + +#[derive(Debug, Error)] +pub(super) enum Error { + #[error("runner failed with a non-deterministic error: {0:#}")] + NonDeterministic(#[source] anyhow::Error), + + #[error("runner failed with a deterministic error: {0:#}")] + Deterministic(#[source] anyhow::Error), +} + +impl Error { + pub(super) fn context(self, context: C) -> Self + where + C: std::fmt::Display + Send + Sync + 'static, + { + match self { + Self::NonDeterministic(e) => Self::NonDeterministic(e.context(context)), + Self::Deterministic(e) => Self::Deterministic(e.context(context)), + } + } + + pub(super) fn is_deterministic(&self) -> bool { + match self { + Self::Deterministic(_) => true, + Self::NonDeterministic(_) => false, + } + } +} + +impl From for Error +where + T: 
std::error::Error + IsDeterministic + Send + Sync + 'static, +{ + fn from(e: T) -> Self { + if e.is_deterministic() { + Self::Deterministic(e.into()) + } else { + Self::NonDeterministic(e.into()) + } + } +} diff --git a/core/src/amp_subgraph/runner/latest_blocks.rs b/core/src/amp_subgraph/runner/latest_blocks.rs new file mode 100644 index 00000000000..559aef963cd --- /dev/null +++ b/core/src/amp_subgraph/runner/latest_blocks.rs @@ -0,0 +1,179 @@ +use alloy::primitives::BlockNumber; +use anyhow::anyhow; +use arrow::array::RecordBatch; +use futures::{future::try_join_all, stream::BoxStream, StreamExt, TryFutureExt}; +use graph::amp::{ + client::ResponseBatch, + codec::{utils::block_number_decoder, Decoder}, + error::IsDeterministic, + manifest::DataSource, + Client, +}; +use itertools::Itertools; +use slog::debug; + +use super::{Context, Error}; + +pub(super) type TablePtr = (usize, usize); + +pub(super) struct LatestBlocks(Vec<(TablePtr, BlockNumber)>); + +impl LatestBlocks { + pub(super) async fn load(cx: &Context) -> Result + where + AC: Client, + { + debug!(cx.logger, "Loading latest blocks"); + let _section = cx.metrics.stopwatch.start_section("load_latest_blocks"); + + let latest_block_futs = cx + .manifest + .data_sources + .iter() + .enumerate() + .map(|(i, data_source)| { + data_source + .source + .tables + .iter() + .enumerate() + .map(move |(j, table)| ((i, j), &data_source.source.dataset, table)) + }) + .flatten() + .unique_by(|(_, dataset, table)| (dataset.to_string(), table.to_string())) + .map(|(table_ptr, dataset, table)| { + latest_block(&cx, dataset, table) + .map_ok(move |latest_block| (table_ptr, latest_block)) + .map_err(move |e| { + e.context(format!( + "failed to load latest block for '{dataset}.{table}'" + )) + }) + }); + + try_join_all(latest_block_futs).await.map(Self) + } + + pub(super) fn filter_completed(self, cx: &Context) -> Self + where + AC: Client, + { + let latest_synced_block = cx.latest_synced_block(); + + Self( + self.0 + .into_iter() + .filter(|((i, _), _)| { + !indexing_completed(&cx.manifest.data_sources[*i], &latest_synced_block) + }) + .collect(), + ) + } + + pub(super) fn min(&self) -> BlockNumber { + self.0 + .iter() + .min_by_key(|(_, latest_block)| *latest_block) + .map(|(_, latest_block)| *latest_block) + .unwrap() + } + + pub(super) async fn changed(self, cx: &Context) -> Result<(), Error> + where + AC: Client, + { + debug!(cx.logger, "Waiting for new blocks"); + let _section = cx.metrics.stopwatch.start_section("latest_blocks_changed"); + + let min_latest_block = self.min(); + let latest_synced_block = cx.latest_synced_block(); + + let latest_block_changed_futs = self + .0 + .into_iter() + .filter(|(_, latest_block)| *latest_block == min_latest_block) + .filter(|((i, _), _)| { + !indexing_completed(&cx.manifest.data_sources[*i], &latest_synced_block) + }) + .map(|((i, j), latest_block)| { + let source = &cx.manifest.data_sources[i].source; + let dataset = &source.dataset; + let table = &source.tables[j]; + + latest_block_changed(&cx, dataset, table, latest_block).map_err(move |e| { + e.context(format!( + "failed to check if the latest block changed in '{dataset}.{table}'" + )) + }) + }); + + let _response = try_join_all(latest_block_changed_futs).await?; + + Ok(()) + } + + pub(super) fn iter(&self) -> impl Iterator { + self.0.iter() + } +} + +fn indexing_completed(data_source: &DataSource, latest_synced_block: &Option) -> bool { + latest_synced_block + .as_ref() + .is_some_and(|latest_synced_block| *latest_synced_block >= 
data_source.source.end_block) +} + +async fn latest_block( + cx: &Context, + dataset: &str, + table: &str, +) -> Result +where + AC: Client, +{ + let query = format!("SELECT MAX(_block_num) FROM {dataset}.{table}"); + let stream = cx.client.query(&cx.logger, query, None); + let record_batch = read_once(stream).await?; + + let latest_block = block_number_decoder(&record_batch, 0) + .map_err(|e| Error::Deterministic(e))? + .decode(0) + .map_err(|e| Error::Deterministic(e))? + .ok_or_else(|| Error::NonDeterministic(anyhow!("table is empty")))?; + + Ok(latest_block) +} + +async fn latest_block_changed( + cx: &Context, + dataset: &str, + table: &str, + latest_block: BlockNumber, +) -> Result<(), Error> +where + AC: Client, +{ + let query = format!("SELECT _block_num FROM {dataset}.{table} WHERE _block_num > {latest_block} SETTINGS stream = true"); + let stream = cx.client.query(&cx.logger, query, None); + let _record_batch = read_once(stream).await?; + + Ok(()) +} + +async fn read_once( + mut stream: BoxStream<'static, Result>, +) -> Result +where + E: std::error::Error + IsDeterministic + Send + Sync + 'static, +{ + let response = stream + .next() + .await + .ok_or_else(|| Error::NonDeterministic(anyhow!("stream is empty")))? + .map_err(Error::from)?; + + match response { + ResponseBatch::Batch { data } => Ok(data), + _ => Err(Error::NonDeterministic(anyhow!("response is empty"))), + } +} diff --git a/core/src/amp_subgraph/runner/mod.rs b/core/src/amp_subgraph/runner/mod.rs new file mode 100644 index 00000000000..8fee0e9fda4 --- /dev/null +++ b/core/src/amp_subgraph/runner/mod.rs @@ -0,0 +1,181 @@ +mod compat; +mod context; +mod data_processing; +mod data_stream; +mod error; +mod latest_blocks; +mod reorg_handler; + +use std::time::{Duration, Instant}; + +use anyhow::Result; +use futures::{future::BoxFuture, StreamExt}; +use graph::{ + amp::Client, cheap_clone::CheapClone, components::store::EntityCache, + data::subgraph::schema::SubgraphError, +}; +use slog::{debug, error, warn}; +use tokio_util::sync::CancellationToken; + +use self::{ + compat::Compat, data_processing::process_record_batch_groups, data_stream::new_data_stream, + error::Error, latest_blocks::LatestBlocks, reorg_handler::check_and_handle_reorg, +}; + +pub(super) use self::context::Context; + +pub(super) fn new_runner( + mut cx: Context, +) -> Box BoxFuture<'static, Result<()>> + Send + 'static> +where + AC: Client + Send + Sync + 'static, +{ + Box::new(move |cancel_token| { + Box::pin(async move { + let indexing_duration_handle = tokio::spawn({ + let mut instant = Instant::now(); + let indexing_duration = cx.metrics.indexing_duration.clone(); + + async move { + loop { + tokio::time::sleep(Duration::from_secs(1)).await; + + let prev_instant = std::mem::replace(&mut instant, Instant::now()); + indexing_duration.record(prev_instant.elapsed()); + } + } + }); + + let result = cancel_token + .run_until_cancelled(run_indexing_with_retries(&mut cx)) + .await; + + indexing_duration_handle.abort(); + + match result { + Some(result) => result?, + None => { + debug!(cx.logger, "Processed cancel signal"); + } + } + + cx.metrics.deployment_status.stopped(); + + debug!(cx.logger, "Waiting for the store to finish processing"); + cx.store.flush().await?; + Ok(()) + }) + }) +} + +async fn run_indexing(cx: &mut Context) -> Result<(), Error> +where + AC: Client, +{ + cx.metrics.deployment_status.starting(); + + if let Some(latest_synced_block) = cx.latest_synced_block() { + cx.metrics.deployment_head.update(latest_synced_block); + } + + 
cx.metrics + .deployment_synced + .record(cx.store.is_deployment_synced()); + + loop { + cx.metrics.deployment_status.running(); + + debug!(cx.logger, "Running indexing"; + "latest_synced_block_ptr" => ?cx.latest_synced_block_ptr() + ); + + let mut latest_blocks = LatestBlocks::load(cx).await?; + check_and_handle_reorg(cx, &latest_blocks).await?; + + if cx.indexing_completed() { + cx.metrics.deployment_synced.record(true); + + debug!(cx.logger, "Indexing completed"); + return Ok(()); + } + + latest_blocks = latest_blocks.filter_completed(cx); + let latest_block = latest_blocks.min(); + + cx.metrics + .deployment_target + .update(latest_block.min(cx.max_end_block())); + + let mut deployment_is_failed = cx.store.health().await?.is_failed(); + let mut entity_cache = EntityCache::new(cx.store.cheap_clone()); + let mut stream = new_data_stream(cx, latest_block); + + while let Some(result) = stream.next().await { + let (record_batch_groups, stream_table_ptr) = result?; + + entity_cache = process_record_batch_groups( + cx, + entity_cache, + record_batch_groups, + stream_table_ptr, + latest_block, + ) + .await?; + + if deployment_is_failed { + if let Some(block_ptr) = cx.store.block_ptr() { + cx.store.unfail_non_deterministic_error(&block_ptr)?; + deployment_is_failed = false; + } + } + } + + debug!(cx.logger, "Completed indexing iteration"; + "latest_synced_block_ptr" => ?cx.latest_synced_block_ptr() + ); + + latest_blocks.changed(cx).await?; + cx.backoff.reset(); + } +} + +async fn run_indexing_with_retries(cx: &mut Context) -> Result<()> +where + AC: Client, +{ + loop { + match run_indexing(cx).await { + Ok(()) => return Ok(()), + Err(e) => { + cx.metrics.deployment_status.failed(); + + let deterministic = e.is_deterministic(); + + cx.store + .fail_subgraph(SubgraphError { + subgraph_id: cx.deployment.cheap_clone(), + message: format!("{e:#}"), + block_ptr: None, // TODO: Find a way to propagate the block ptr here + handler: None, + deterministic, + }) + .await?; + + if deterministic { + error!(cx.logger, "Subgraph failed with a deterministic error"; + "e" => ?e + ); + return Err(e.into()); + } + + warn!(cx.logger, "Subgraph failed with a non-deterministic error"; + "e" => ?e, + "retry_delay_seconds" => cx.backoff.delay().as_secs() + ); + + cx.backoff.sleep_async().await; + debug!(cx.logger, "Restarting indexing"); + } + } + } +} diff --git a/core/src/amp_subgraph/runner/reorg_handler.rs b/core/src/amp_subgraph/runner/reorg_handler.rs new file mode 100644 index 00000000000..911c4ebf818 --- /dev/null +++ b/core/src/amp_subgraph/runner/reorg_handler.rs @@ -0,0 +1,163 @@ +use alloy::primitives::{BlockHash, BlockNumber}; +use anyhow::anyhow; +use futures::{future::try_join_all, StreamExt, TryFutureExt}; +use graph::{ + amp::{ + client::{LatestBlockBeforeReorg, RequestMetadata, ResponseBatch, ResumeStreamingQuery}, + Client, + }, + blockchain::block_stream::FirehoseCursor, +}; +use itertools::Itertools; +use slog::debug; + +use super::{Compat, Context, Error, LatestBlocks}; + +pub(super) async fn check_and_handle_reorg( + cx: &Context, + latest_blocks: &LatestBlocks, +) -> Result<(), Error> +where + AC: Client, +{ + let logger = cx + .logger + .new(slog::o!("process" => "check_and_handle_reorg")); + + let Some((latest_synced_block_number, latest_synced_block_hash)) = cx.latest_synced_block_ptr() + else { + debug!(logger, "There are no synced blocks; Skipping reorg check"); + return Ok(()); + }; + + debug!(logger, "Running reorg check"); + + let Some(latest_block_before_reorg) = detect_deepest_reorg( 
+ cx, + latest_blocks, + latest_synced_block_number, + latest_synced_block_hash, + ) + .await? + else { + debug!(logger, "Successfully checked for reorg: No reorg detected"; + "latest_synced_block" => latest_synced_block_number + ); + return Ok(()); + }; + + let _section = cx.metrics.stopwatch.start_section("handle_reorg"); + + debug!(logger, "Handling reorg"; + "latest_synced_block" => latest_synced_block_number, + "latest_block_before_reorg" => ?latest_block_before_reorg.block_number + ); + + let (block_number, block_hash) = match ( + latest_block_before_reorg.block_number, + latest_block_before_reorg.block_hash, + ) { + (Some(block_number), Some(block_hash)) => (block_number, block_hash), + (_, _) => { + // TODO: Handle reorgs to the genesis block + return Err(Error::Deterministic(anyhow!( + "invalid reorg: rewind to the genesis block not supported" + ))); + } + }; + + if block_number > latest_synced_block_number { + return Err(Error::Deterministic(anyhow!( + "invalid reorg: latest block before reorg cannot be higher than the invalidated block" + ))); + } else if block_number == latest_synced_block_number && block_hash == latest_synced_block_hash { + return Err(Error::Deterministic(anyhow!( + "invalid reorg: latest block before reorg cannot be equal to the invalidated block" + ))); + } + + cx.store + .revert_block_operations((block_number, block_hash).compat(), FirehoseCursor::None) + .await + .map_err(Error::from)?; + + Ok(()) +} + +async fn detect_deepest_reorg( + cx: &Context, + latest_blocks: &LatestBlocks, + latest_synced_block_number: BlockNumber, + latest_synced_block_hash: BlockHash, +) -> Result, Error> +where + AC: Client, +{ + let detect_reorg_futs = latest_blocks + .iter() + .filter(|(_, latest_block)| *latest_block >= latest_synced_block_number) + .map(|((i, j), _)| { + let data_source = &cx.manifest.data_sources[*i]; + let network = &data_source.network; + let dataset = &data_source.source.dataset; + let table = &data_source.source.tables[*j]; + + detect_reorg( + &cx, + network, + dataset, + table, + latest_synced_block_number, + latest_synced_block_hash, + ) + .map_err(move |e| e.context(format!("failed to detect reorg in '{dataset}.{table}'"))) + }); + + let deepest_reorg = try_join_all(detect_reorg_futs) + .await? + .into_iter() + .flatten() + .min_by_key(|latest_block_before_reorg| latest_block_before_reorg.block_number); + + Ok(deepest_reorg) +} + +async fn detect_reorg( + cx: &Context, + network: &str, + dataset: &str, + table: &str, + latest_synced_block_number: BlockNumber, + latest_synced_block_hash: BlockHash, +) -> Result, Error> +where + AC: Client, +{ + let query = format!("SELECT _block_num FROM {dataset}.{table} SETTINGS stream = true"); + let mut stream = cx.client.query( + &cx.logger, + query, + Some(RequestMetadata { + resume_streaming_query: Some(vec![ResumeStreamingQuery { + network: network.to_string(), + block_number: latest_synced_block_number, + block_hash: latest_synced_block_hash, + }]), + }), + ); + + let response = stream + .next() + .await + .ok_or_else(|| Error::NonDeterministic(anyhow!("stream is empty")))? + .map_err(Error::from)?; + + match response { + ResponseBatch::Batch { .. 
} => Ok(None), + ResponseBatch::Reorg(reorg) => reorg + .into_iter() + .exactly_one() + .map_err(|_e| Error::Deterministic(anyhow!("multi-chain datasets are not supported"))) + .map(Some), + } +} diff --git a/core/src/lib.rs b/core/src/lib.rs index 448bb1041fd..61de81c0b64 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -1,8 +1,5 @@ +pub mod amp_subgraph; pub mod polling_monitor; -mod subgraph; - -pub use crate::subgraph::{ - SubgraphAssignmentProvider, SubgraphInstanceManager, SubgraphRegistrar, SubgraphRunner, - SubgraphTriggerProcessor, -}; +pub mod subgraph; +pub mod subgraph_provider; diff --git a/core/src/subgraph/context/instance/mod.rs b/core/src/subgraph/context/instance/mod.rs index 86b64195493..0d14ae8d758 100644 --- a/core/src/subgraph/context/instance/mod.rs +++ b/core/src/subgraph/context/instance/mod.rs @@ -182,6 +182,7 @@ where Ok(Some(host)) } } + DataSource::Amp(_) => unreachable!(), } } diff --git a/core/src/subgraph/instance_manager.rs b/core/src/subgraph/instance_manager.rs index 81c1a3ccd1a..77b4b7288f1 100644 --- a/core/src/subgraph/instance_manager.rs +++ b/core/src/subgraph/instance_manager.rs @@ -9,6 +9,7 @@ use crate::subgraph::Decoder; use std::collections::BTreeSet; use crate::subgraph::runner::SubgraphRunner; +use graph::amp; use graph::blockchain::block_stream::{BlockStreamMetrics, TriggersAdapterWrapper}; use graph::blockchain::{Blockchain, BlockchainKind, DataSource, NodeCapabilities}; use graph::components::link_resolver::LinkResolverContext; @@ -31,7 +32,7 @@ use super::SubgraphTriggerProcessor; use crate::subgraph::runner::SubgraphRunnerError; #[derive(Clone)] -pub struct SubgraphInstanceManager { +pub struct SubgraphInstanceManager { logger_factory: LoggerFactory, subgraph_store: Arc, chains: Arc, @@ -40,6 +41,7 @@ pub struct SubgraphInstanceManager { link_resolver: Arc, ipfs_service: IpfsService, arweave_service: ArweaveService, + amp_client: Option>, static_filters: bool, env_vars: Arc, @@ -57,7 +59,10 @@ pub struct SubgraphInstanceManager { } #[async_trait] -impl SubgraphInstanceManagerTrait for SubgraphInstanceManager { +impl SubgraphInstanceManagerTrait for SubgraphInstanceManager +where + NC: amp::Client + Send + Sync + 'static, +{ async fn start_subgraph( self: Arc, loc: DeploymentLocator, @@ -184,7 +189,7 @@ impl SubgraphInstanceManagerTrait for SubgraphInstanceManager< } } -impl SubgraphInstanceManager { +impl SubgraphInstanceManager { pub fn new( logger_factory: &LoggerFactory, env_vars: Arc, @@ -195,6 +200,7 @@ impl SubgraphInstanceManager { link_resolver: Arc, ipfs_service: IpfsService, arweave_service: ArweaveService, + amp_client: Option>, static_filters: bool, ) -> Self { let logger = logger_factory.component_logger("SubgraphInstanceManager", None); @@ -208,6 +214,7 @@ impl SubgraphInstanceManager { instances: SubgraphKeepAlive::new(sg_metrics), link_resolver, ipfs_service, + amp_client, static_filters, env_vars, arweave_service, @@ -325,6 +332,7 @@ impl SubgraphInstanceManager { .resolve( &deployment.hash, &link_resolver, + self.amp_client.cheap_clone(), &logger, ENV_VARS.max_spec_version.clone(), ) diff --git a/core/src/subgraph/mod.rs b/core/src/subgraph/mod.rs index 45f8d5b98ef..8f6bc932daa 100644 --- a/core/src/subgraph/mod.rs +++ b/core/src/subgraph/mod.rs @@ -3,7 +3,6 @@ mod error; mod inputs; mod instance_manager; mod loader; -mod provider; mod registrar; mod runner; mod state; @@ -11,7 +10,6 @@ mod stream; mod trigger_processor; pub use self::instance_manager::SubgraphInstanceManager; -pub use 
self::provider::SubgraphAssignmentProvider; pub use self::registrar::SubgraphRegistrar; pub use self::runner::SubgraphRunner; pub use self::trigger_processor::*; diff --git a/core/src/subgraph/provider.rs b/core/src/subgraph/provider.rs deleted file mode 100644 index 2ea4327838b..00000000000 --- a/core/src/subgraph/provider.rs +++ /dev/null @@ -1,101 +0,0 @@ -use std::sync::Mutex; -use std::{collections::HashSet, time::Instant}; - -use async_trait::async_trait; - -use graph::{ - components::store::{DeploymentId, DeploymentLocator}, - prelude::{SubgraphAssignmentProvider as SubgraphAssignmentProviderTrait, *}, -}; - -#[derive(Debug)] -struct DeploymentRegistry { - subgraphs_deployed: Arc>>, - subgraph_metrics: Arc, -} - -impl DeploymentRegistry { - fn new(subgraph_metrics: Arc) -> Self { - Self { - subgraphs_deployed: Arc::new(Mutex::new(HashSet::new())), - subgraph_metrics, - } - } - - fn insert(&self, id: DeploymentId) -> bool { - if !self.subgraphs_deployed.lock().unwrap().insert(id) { - return false; - } - - self.subgraph_metrics.deployment_count.inc(); - true - } - - fn remove(&self, id: &DeploymentId) -> bool { - if !self.subgraphs_deployed.lock().unwrap().remove(id) { - return false; - } - - self.subgraph_metrics.deployment_count.dec(); - true - } -} - -pub struct SubgraphAssignmentProvider { - logger_factory: LoggerFactory, - deployment_registry: DeploymentRegistry, - instance_manager: Arc, -} - -impl SubgraphAssignmentProvider { - pub fn new( - logger_factory: &LoggerFactory, - instance_manager: I, - subgraph_metrics: Arc, - ) -> Self { - let logger = logger_factory.component_logger("SubgraphAssignmentProvider", None); - let logger_factory = logger_factory.with_parent(logger.clone()); - - // Create the subgraph provider - SubgraphAssignmentProvider { - logger_factory, - instance_manager: Arc::new(instance_manager), - deployment_registry: DeploymentRegistry::new(subgraph_metrics), - } - } -} - -#[async_trait] -impl SubgraphAssignmentProviderTrait for SubgraphAssignmentProvider { - async fn start(&self, loc: DeploymentLocator, stop_block: Option) { - let logger = self.logger_factory.subgraph_logger(&loc); - - // If subgraph ID already in set - if !self.deployment_registry.insert(loc.id) { - info!(logger, "Subgraph deployment is already running"); - - return; - } - - let start_time = Instant::now(); - - self.instance_manager - .cheap_clone() - .start_subgraph(loc, stop_block) - .await; - - debug!( - logger, - "Subgraph started"; - "start_ms" => start_time.elapsed().as_millis() - ); - } - - async fn stop(&self, deployment: DeploymentLocator) { - // If subgraph ID was in set - if self.deployment_registry.remove(&deployment.id) { - // Shut down subgraph processing - self.instance_manager.stop_subgraph(deployment).await; - } - } -} diff --git a/core/src/subgraph/registrar.rs b/core/src/subgraph/registrar.rs index b05ccdf4e33..11892bcb53c 100644 --- a/core/src/subgraph/registrar.rs +++ b/core/src/subgraph/registrar.rs @@ -1,34 +1,30 @@ use std::collections::HashSet; use async_trait::async_trait; -use graph::blockchain::Blockchain; -use graph::blockchain::BlockchainKind; -use graph::blockchain::BlockchainMap; -use graph::components::link_resolver::LinkResolverContext; -use graph::components::store::{DeploymentId, DeploymentLocator, SubscriptionManager}; -use graph::components::subgraph::Settings; -use graph::data::subgraph::schema::DeploymentCreate; -use graph::data::subgraph::Graft; -use graph::data::value::Word; -use graph::futures03; -use graph::futures03::future::TryFutureExt; -use 
graph::futures03::Stream; -use graph::futures03::StreamExt; -use graph::prelude::{ - CreateSubgraphResult, SubgraphAssignmentProvider as SubgraphAssignmentProviderTrait, - SubgraphRegistrar as SubgraphRegistrarTrait, *, +use graph::amp; +use graph::blockchain::{Blockchain, BlockchainKind, BlockchainMap}; +use graph::components::{ + link_resolver::LinkResolverContext, + store::{DeploymentId, DeploymentLocator, SubscriptionManager}, + subgraph::Settings, }; +use graph::data::{ + subgraph::{schema::DeploymentCreate, Graft}, + value::Word, +}; +use graph::futures03::{self, future::TryFutureExt, Stream, StreamExt}; +use graph::prelude::{CreateSubgraphResult, SubgraphRegistrar as SubgraphRegistrarTrait, *}; use graph::tokio_retry::Retry; -use graph::util::futures::retry_strategy; -use graph::util::futures::RETRY_DEFAULT_LIMIT; +use graph::util::futures::{retry_strategy, RETRY_DEFAULT_LIMIT}; -pub struct SubgraphRegistrar { +pub struct SubgraphRegistrar { logger: Logger, logger_factory: LoggerFactory, resolver: Arc, provider: Arc

, store: Arc, subscription_manager: Arc, + amp_client: Option>, chains: Arc, node_id: NodeId, version_switching_mode: SubgraphVersionSwitchingMode, @@ -36,11 +32,12 @@ pub struct SubgraphRegistrar { settings: Arc, } -impl SubgraphRegistrar +impl SubgraphRegistrar where - P: SubgraphAssignmentProviderTrait, + P: graph::components::subgraph::SubgraphInstanceManager, S: SubgraphStore, SM: SubscriptionManager, + AC: amp::Client + Send + Sync + 'static, { pub fn new( logger_factory: &LoggerFactory, @@ -48,6 +45,7 @@ where provider: Arc

, store: Arc, subscription_manager: Arc, + amp_client: Option>, chains: Arc, node_id: NodeId, version_switching_mode: SubgraphVersionSwitchingMode, @@ -65,6 +63,7 @@ where provider, store, subscription_manager, + amp_client, chains, node_id, version_switching_mode, @@ -160,11 +159,14 @@ where // Start subgraph on this node debug!(logger, "Deployment assignee is this node"; "assigned_to" => assigned, "action" => "add"); - self.provider.start(deployment, None).await; + self.provider + .cheap_clone() + .start_subgraph(deployment, None) + .await; } else { // Ensure it is removed from this node debug!(logger, "Deployment assignee is not this node"; "assigned_to" => assigned, "action" => "remove"); - self.provider.stop(deployment).await + self.provider.stop_subgraph(deployment).await } } else { // Was added/updated, but is now gone. @@ -172,10 +174,7 @@ where } } AssignmentOperation::Removed => { - // Send remove event without checking node ID. - // If node ID does not match, then this is a no-op when handled in - // assignment provider. - self.provider.stop(deployment).await; + self.provider.stop_subgraph(deployment).await; } } } @@ -210,7 +209,7 @@ where let provider = self.provider.cheap_clone(); graph::spawn(async move { - provider.start(id, None).await; + provider.start_subgraph(id, None).await; drop(sender) }); } @@ -223,11 +222,12 @@ where } #[async_trait] -impl SubgraphRegistrarTrait for SubgraphRegistrar +impl SubgraphRegistrarTrait for SubgraphRegistrar where - P: SubgraphAssignmentProviderTrait, + P: graph::components::subgraph::SubgraphInstanceManager, S: SubgraphStore, SM: SubscriptionManager, + AC: amp::Client + Send + Sync + 'static, { async fn create_subgraph( &self, @@ -299,7 +299,7 @@ where let deployment_locator = match kind { BlockchainKind::Ethereum => { - create_subgraph_version::( + create_subgraph_version::( &logger, self.store.clone(), self.chains.cheap_clone(), @@ -312,12 +312,13 @@ where debug_fork, self.version_switching_mode, &resolver, + self.amp_client.cheap_clone(), history_blocks, ) .await? } BlockchainKind::Near => { - create_subgraph_version::( + create_subgraph_version::( &logger, self.store.clone(), self.chains.cheap_clone(), @@ -330,12 +331,13 @@ where debug_fork, self.version_switching_mode, &resolver, + self.amp_client.cheap_clone(), history_blocks, ) .await? } BlockchainKind::Substreams => { - create_subgraph_version::( + create_subgraph_version::( &logger, self.store.clone(), self.chains.cheap_clone(), @@ -348,6 +350,7 @@ where debug_fork, self.version_switching_mode, &resolver, + self.amp_client.cheap_clone(), history_blocks, ) .await? 
@@ -461,7 +464,7 @@ async fn resolve_graft_block( }) } -async fn create_subgraph_version( +async fn create_subgraph_version( logger: &Logger, store: Arc, chains: Arc, @@ -474,6 +477,7 @@ async fn create_subgraph_version( debug_fork: Option, version_switching_mode: SubgraphVersionSwitchingMode, resolver: &Arc, + amp_client: Option>, history_blocks_override: Option, ) -> Result { let raw_string = serde_yaml::to_string(&raw).unwrap(); @@ -481,7 +485,8 @@ async fn create_subgraph_version( let unvalidated = UnvalidatedSubgraphManifest::::resolve( deployment.clone(), raw, - &resolver, + resolver, + amp_client, logger, ENV_VARS.max_spec_version.clone(), ) diff --git a/core/src/subgraph_provider.rs b/core/src/subgraph_provider.rs new file mode 100644 index 00000000000..65eeb7eb728 --- /dev/null +++ b/core/src/subgraph_provider.rs @@ -0,0 +1,365 @@ +use std::{collections::HashMap, sync::Arc, time::Instant}; + +use graph::{ + amp, + cheap_clone::CheapClone as _, + components::{ + link_resolver::{LinkResolver, LinkResolverContext}, + metrics::subgraph::SubgraphCountMetric, + store::DeploymentLocator, + subgraph::SubgraphInstanceManager, + }, + log::factory::LoggerFactory, +}; +use itertools::Itertools as _; +use parking_lot::RwLock; +use slog::{debug, error}; +use tokio_util::sync::CancellationToken; + +/// Starts and stops subgraph deployments. +/// +/// For each subgraph deployment, checks the subgraph processing kind +/// and finds the appropriate subgraph instance manager to handle the +/// processing of the subgraph deployment. +/// +/// This is required to support both trigger-based subgraphs and Amp-powered subgraphs, +/// which have separate runners. +pub struct SubgraphProvider { + logger_factory: LoggerFactory, + count_metrics: Arc, + link_resolver: Arc, + + /// Stops active subgraph start request tasks. + /// + /// When a subgraph deployment start request is processed, a background task is created + /// to load the subgraph manifest and determine the subgraph processing kind. The processing + /// kind is then used to find the appropriate subgraph instance manager. This token stops + /// all tasks that are still loading manifests or waiting for subgraphs to start. + cancel_token: CancellationToken, + + /// Contains the enabled subgraph instance managers. + /// + /// Only subgraphs for which there is an appropriate instance manager will be started. + instance_managers: SubgraphInstanceManagers, + + /// Maintains a list of started subgraphs with their processing kinds. + /// + /// Used to forward subgraph deployment stop requests to the appropriate subgraph instance manager. + assignments: SubgraphAssignments, +} + +impl SubgraphProvider { + /// Creates a new subgraph provider. 
+ /// + /// # Arguments + /// - `logger_factory`: Creates loggers for each subgraph deployment start/stop request + /// - `count_metrics`: Tracks the number of started subgraph deployments + /// - `link_resolver`: Loads subgraph manifests to determine the subgraph processing kinds + /// - `cancel_token`: Stops active subgraph start request tasks + /// - `instance_managers`: Contains the enabled subgraph instance managers + pub fn new( + logger_factory: &LoggerFactory, + count_metrics: Arc, + link_resolver: Arc, + cancel_token: CancellationToken, + instance_managers: SubgraphInstanceManagers, + ) -> Self { + let logger = logger_factory.component_logger("SubgraphProvider", None); + let logger_factory = logger_factory.with_parent(logger.cheap_clone()); + + debug!(logger, "Creating subgraph provider"; + "enabled_subgraph_processing_kinds" => instance_managers.0.keys().join(", ") + ); + + Self { + logger_factory, + count_metrics, + link_resolver, + cancel_token, + instance_managers, + assignments: SubgraphAssignments::new(), + } + } + + /// Starts a subgraph deployment with the appropriate subgraph instance manager. + /// + /// Loads the subgraph manifest for the specified deployment locator, determines + /// the subgraph processing kind, finds the required instance manager, and forwards + /// the start request to that instance manager. Keeps the subgraph processing kind + /// in memory for handling the stop requests. + async fn assign_and_start_subgraph( + &self, + loc: DeploymentLocator, + stop_block: Option, + ) -> Result<(), Error> { + let logger = self.logger_factory.subgraph_logger(&loc); + + let link_resolver = self + .link_resolver + .for_manifest(&loc.hash.to_string()) + .map_err(|e| Error::CreateLinkResolver { + loc: loc.cheap_clone(), + source: e, + })?; + + let file_bytes = link_resolver + .cat( + &LinkResolverContext::new(&loc.hash, &logger), + &loc.hash.to_ipfs_link(), + ) + .await + .map_err(|e| Error::LoadManifest { + loc: loc.cheap_clone(), + source: e, + })?; + + let raw_manifest: serde_yaml::Mapping = + serde_yaml::from_slice(&file_bytes).map_err(|e| Error::ParseManifest { + loc: loc.cheap_clone(), + source: e, + })?; + + let subgraph_kind = SubgraphProcessingKind::from_manifest(&raw_manifest); + self.assignments.set_subgraph_kind(&loc, subgraph_kind); + + let Some(instance_manager) = self.instance_managers.get(&subgraph_kind) else { + return Err(Error::GetManager { loc, subgraph_kind }); + }; + + instance_manager.start_subgraph(loc, stop_block).await; + Ok(()) + } +} + +#[async_trait::async_trait] +impl SubgraphInstanceManager for SubgraphProvider { + async fn start_subgraph(self: Arc, loc: DeploymentLocator, stop_block: Option) { + let logger = self + .logger_factory + .subgraph_logger(&loc) + .new(slog::o!("method" => "start_subgraph")); + + if self.assignments.is_assigned(&loc) { + debug!(logger, "Subgraph is already started"); + return; + } + + self.count_metrics.deployment_count.inc(); + + let handle = tokio::spawn({ + let provider = self.cheap_clone(); + let loc = loc.cheap_clone(); + let start_instant = Instant::now(); + + async move { + debug!(logger, "Starting subgraph"); + + let fut = provider.assign_and_start_subgraph(loc, stop_block); + match provider.cancel_token.run_until_cancelled(fut).await { + Some(Ok(())) => { + debug!(logger, "Subgraph started"; + "duration_ms" => start_instant.elapsed().as_millis() + ); + } + Some(Err(e)) => { + error!(logger, "Subgraph failed to start"; + "e" => ?e + ); + } + None => { + debug!(logger, "Subgraph start cancelled"); + } + 
} + } + }); + + self.assignments.add( + loc, + SubgraphAssignment { + handle, + subgraph_kind: None, + }, + ) + } + + async fn stop_subgraph(&self, loc: DeploymentLocator) { + let logger = self + .logger_factory + .subgraph_logger(&loc) + .new(slog::o!("method" => "stop_subgraph")); + + debug!(logger, "Stopping subgraph"); + + let Some(SubgraphAssignment { + handle, + subgraph_kind, + }) = self.assignments.take(&loc) + else { + debug!(logger, "Subgraph is not started"); + return; + }; + + handle.abort(); + self.count_metrics.deployment_count.dec(); + + let Some(subgraph_kind) = subgraph_kind else { + debug!(logger, "Unknown subgraph kind"); + return; + }; + + let Some(instance_manager) = self.instance_managers.get(&subgraph_kind) else { + debug!(logger, "Missing instance manager"); + return; + }; + + instance_manager.stop_subgraph(loc).await; + debug!(logger, "Subgraph stopped"); + } +} + +/// Enumerates all possible errors of the subgraph provider. +#[derive(Debug, thiserror::Error)] +enum Error { + #[error("failed to create link resolver for '{loc}': {source:#}")] + CreateLinkResolver { + loc: DeploymentLocator, + source: anyhow::Error, + }, + + #[error("failed to load manifest for '{loc}': {source:#}")] + LoadManifest { + loc: DeploymentLocator, + source: anyhow::Error, + }, + + #[error("failed to parse manifest for '{loc}': {source:#}")] + ParseManifest { + loc: DeploymentLocator, + source: serde_yaml::Error, + }, + + #[error("failed to get instance manager for '{loc}' with kind '{subgraph_kind}'")] + GetManager { + loc: DeploymentLocator, + subgraph_kind: SubgraphProcessingKind, + }, +} + +/// Contains a mapping of enabled subgraph instance managers by subgraph processing kinds. +/// +/// Before starting a subgraph, its processing kind is determined from the subgraph manifest. +/// Then, the appropriate instance manager is loaded from this mapping. +pub struct SubgraphInstanceManagers( + HashMap>, +); + +impl SubgraphInstanceManagers { + /// Creates a new empty subgraph instance manager mapping. + pub fn new() -> Self { + Self(HashMap::new()) + } + + /// Adds a new subgraph instance manager for all subgraphs of the specified processing kind. + pub fn add( + &mut self, + subgraph_kind: SubgraphProcessingKind, + instance_manager: Arc, + ) { + self.0.insert(subgraph_kind, instance_manager); + } + + /// Returns the subgraph instance manager for the specified processing kind. + pub fn get( + &self, + subgraph_kind: &SubgraphProcessingKind, + ) -> Option> { + self.0 + .get(subgraph_kind) + .map(|instance_manager| instance_manager.cheap_clone()) + } +} + +/// Enumerates the supported subgraph processing kinds. +/// +/// Subgraphs may have different processing requirements, and this enum helps to map them +/// to the appropriate instance managers. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, strum::Display)] +#[strum(serialize_all = "snake_case")] +pub enum SubgraphProcessingKind { + /// Represents trigger-based subgraphs. + Trigger, + + /// Represents Amp-powered subgraphs. + Amp, +} + +impl SubgraphProcessingKind { + /// Determines the subgraph processing kind from the subgraph manifest. 
+    fn from_manifest(raw_manifest: &serde_yaml::Mapping) -> Self {
+        use serde_yaml::Value;
+
+        let is_amp_manifest = raw_manifest
+            .get("dataSources")
+            .and_then(Value::as_sequence)
+            .and_then(|seq| {
+                seq.iter()
+                    .filter_map(Value::as_mapping)
+                    .filter_map(|map| map.get("kind"))
+                    .filter_map(Value::as_str)
+                    .filter(|kind| *kind == amp::manifest::DataSource::KIND)
+                    .next()
+            })
+            .is_some();
+
+        if is_amp_manifest {
+            return Self::Amp;
+        }
+
+        Self::Trigger
+    }
+}
+
+/// Maintains a list of started subgraph deployments with details required for stopping them.
+struct SubgraphAssignments(RwLock<HashMap<DeploymentLocator, SubgraphAssignment>>);
+
+impl SubgraphAssignments {
+    /// Creates a new empty list of started subgraph deployments.
+    fn new() -> Self {
+        Self(RwLock::new(HashMap::new()))
+    }
+
+    /// Adds a new subgraph deployment to the list of started subgraph deployments.
+    fn add(&self, loc: DeploymentLocator, subgraph_assignment: SubgraphAssignment) {
+        self.0.write().insert(loc, subgraph_assignment);
+    }
+
+    /// Updates the started subgraph deployment with the specified subgraph processing kind.
+    fn set_subgraph_kind(&self, loc: &DeploymentLocator, subgraph_kind: SubgraphProcessingKind) {
+        if let Some(subgraph_assignment) = self.0.write().get_mut(loc) {
+            subgraph_assignment.subgraph_kind = Some(subgraph_kind);
+        }
+    }
+
+    /// Checks if the subgraph deployment is started.
+    fn is_assigned(&self, loc: &DeploymentLocator) -> bool {
+        self.0.read().contains_key(loc)
+    }
+
+    /// Removes the subgraph deployment from the list of started subgraph deployments and returns its details.
+    fn take(&self, loc: &DeploymentLocator) -> Option<SubgraphAssignment> {
+        self.0.write().remove(loc)
+    }
+}
+
+/// Contains the details of a started subgraph deployment.
+struct SubgraphAssignment {
+    /// The handle to the background task that starts this subgraph deployment.
+    handle: tokio::task::JoinHandle<()>,
+
+    /// The subgraph processing kind of this subgraph deployment.
+    ///
+    /// Used to get the appropriate subgraph instance manager to forward the stop request to.
+    ///
+    /// Set to `None` until the subgraph manifest is loaded and parsed.
+    subgraph_kind: Option<SubgraphProcessingKind>,
+}
diff --git a/docs/amp-powered-subgraphs.md b/docs/amp-powered-subgraphs.md
new file mode 100644
index 00000000000..26255a14938
--- /dev/null
+++ b/docs/amp-powered-subgraphs.md
@@ -0,0 +1,407 @@
+# Amp-powered subgraphs
+
+> [!NOTE]
+> This feature is available starting from spec version `1.4.0`
+
+Amp-powered subgraphs are a new kind of subgraph with SQL data sources that query and index data from the Amp servers.
+They are significantly more efficient than standard subgraphs, and in most cases the indexing time can be reduced from days or weeks
+to minutes or hours.
+
+## Prerequisites
+
+To enable Amp-powered subgraphs, the `GRAPH_AMP_FLIGHT_SERVICE_ADDRESS` ENV variable must be set to a valid Amp Flight gRPC service address.
+
+Additionally, if authentication is required for the Amp Flight gRPC service, the `GRAPH_AMP_FLIGHT_SERVICE_TOKEN` ENV variable must contain a valid authentication token.
+
+## Subgraph manifest
+
+Amp-powered subgraphs introduce a new structure for defining Amp subgraph data sources within the manifest.
+
+### Spec version
+
+The minimum spec version for Amp-powered subgraphs is `1.4.0`.
+
+

+Example YAML: + +```yaml +specVersion: 1.4.0 +# .. other fields ... +``` +
+ +### Data source structure + +### `kind` + +Every Amp data source must have the `kind` set to `amp`, and Amp-powered subgraphs must contain only Amp data sources. +This is used to assign the subgraph to the appropriate indexing process. + +
+Example YAML: + +```yaml +dataSources: + - kind: amp + # .. other fields ... +``` +
+
+### `name`
+
+Every Amp data source must have the `name` set to a non-empty string, containing only numbers, letters, hyphens, or underscores.
+This name is used for observability purposes and to identify progress and potential errors produced by the data source.
+
+
+Example YAML: + +```yaml +dataSources: + - name: Transfers + # .. other fields ... +``` +
+ +### `network` + +Every Amp data source must have the `network` field set to a valid network name. +This is used to validate that the SQL queries for this data source produce results for the expected network. + +> [!NOTE] +> Currently, the SQL queries are required to produce results for a single network in order to maintain compatibility with non-Amp subgraphs. + +
+Example YAML: + +```yaml +dataSources: + - network: ethereum-mainnet + # .. other fields ... +``` +
+ +### `source` + +Every Amp data source must have a valid `source` that describes the behavior of SQL queries from this data source. + +### `source.dataset` + +Contains the name of the dataset that can be queried by SQL queries in this data source. +This is used to validate that the SQL queries for this data source only query the expected dataset. + +
+Example YAML: + +```yaml +dataSources: + - source: + dataset: edgeandnode/ethereum_mainnet + # .. other fields ... +``` +
+ +### `source.tables` + +Contains the names of the tables that can be queried by SQL queries in this data source. +This is used to validate that the SQL queries for this data source only query the expected tables. + +
+Example YAML: + +```yaml +dataSources: + - source: + tables: + - blocks + - transactions + # .. other fields ... +``` +
+ +### `source.address` + +Contains the contract address with which SQL queries in the data source interact. + +Enables SQL query reuse through `sg_source_address()` calls instead of hard-coding the contract address. +SQL queries resolve `sg_source_address()` calls to this contract address. + +
+Example YAML: + +```yaml +dataSources: + - source: + address: "0xc944E90C64B2c07662A292be6244BDf05Cda44a7" + # .. other fields ... +``` +
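+For illustration, a hypothetical SQL query that relies on this resolution (a sketch only: the `logs` table and its columns are assumptions used for this example, not part of the manifest above):
+
+```sql
+-- Illustrative only: `logs`, `address`, `topic0`, and `data` are placeholder names.
+-- sg_source_address() is resolved to the `source.address` value from the manifest.
+SELECT _block_num, topic0, data
+FROM "edgeandnode/ethereum_mainnet".logs
+WHERE address = sg_source_address();
+```
+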
+ +### `source.startBlock` + +Contains the minimum block number that SQL queries in the data source can query. +This is used as a starting point for the indexing process. + +_When not provided, defaults to block number `0`._ + +
+Example YAML: + +```yaml +dataSources: + - source: + startBlock: 11446769 + # .. other fields ... +``` +
+ +### `source.endBlock` + +Contains the maximum block number that SQL queries in the data source can query. +Reaching this block number will complete the indexing process. + +_When not provided, defaults to the maximum possible block number._ + +
+Example YAML: + +```yaml +dataSources: + - source: + endBlock: 23847939 + # .. other fields ... +``` +
+
+### `transformer`
+
+Every Amp data source must have a valid `transformer` that describes the transformations of source tables indexed by the Amp-powered subgraph.
+
+### `transformer.apiVersion`
+
+Represents the version of this transformer. Each version may contain a different set of features.
+
+> [!NOTE]
+> Currently, only version `0.0.1` is available.
+
+
+Example YAML:
+
+```yaml
+dataSources:
+  - transformer:
+      apiVersion: 0.0.1
+      # .. other fields ...
+```
+
+
+### `transformer.abis`
+
+Contains a list of ABIs that SQL queries can reference to extract event signatures.
+
+Enables the use of `sg_event_signature('CONTRACT_NAME', 'EVENT_NAME')` calls in SQL queries,
+which are resolved to full event signatures based on this list.
+
+_When not provided, defaults to an empty list._
+
+
+Example YAML:
+
+```yaml
+dataSources:
+  - transformer:
+      abis:
+        - name: ERC721 # The name of the contract
+          file:
+      # .. other fields ...
+```
+
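+For illustration, a hypothetical SQL query that uses this ABI to filter logs by event signature (a sketch only: the `logs` table and its `topic*` columns are assumptions used for this example):
+
+```sql
+-- Illustrative only: `logs`, `topic0`, `topic1`, `topic2`, and `topic3` are placeholder names.
+-- sg_event_signature('ERC721', 'Transfer') is resolved to the full event signature from the ERC721 ABI above.
+SELECT _block_num, topic1, topic2, topic3
+FROM "edgeandnode/ethereum_mainnet".logs
+WHERE topic0 = sg_event_signature('ERC721', 'Transfer');
+```
+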
+ +### `transformer.tables` + +Contains a list of transformed tables that extract data from source tables into subgraph entities. + +### Transformer table structure + +### `transformer.tables[i].name` + +Represents the name of the transformed table. Must reference a valid entity name from the subgraph schema. + +
+Example:
+
+**GraphQL schema:**
+
+```graphql
+type Block @entity(immutable: true) {
+  # .. entity fields ...
+}
+```
+
+**YAML manifest:**
+```yaml
+dataSources:
+  - transformer:
+      tables:
+        - name: Block
+          # .. other fields ...
+```
+
+
+### `transformer.tables[i].query`
+
+Contains an inline SQL query that executes on the Amp server.
+This is useful for simple SQL queries like `SELECT * FROM "edgeandnode/ethereum_mainnet".blocks;`.
+For more complex cases, a separate file containing the SQL query can be used in the `file` field.
+
+The data resulting from this SQL query execution is transformed into subgraph entities.
+
+_When not provided, the `file` field is used instead._
+
+
+Example YAML:
+
+```yaml
+dataSources:
+  - transformer:
+      tables:
+        - query: SELECT * FROM "edgeandnode/ethereum_mainnet".blocks;
+          # .. other fields ...
+```
+
+
+### `transformer.tables[i].file`
+
+Contains the IPFS link to the SQL query that executes on the Amp server.
+
+The data resulting from this SQL query execution is transformed into subgraph entities.
+
+_Ignored when the `query` field is provided._
+_When not provided, the `query` field is used instead._
+
+
+Example YAML:
+
+```yaml
+dataSources:
+  - transformer:
+      tables:
+        - file:
+          # .. other fields ...
+```
+
+ +### Amp-powered subgraph examples + +Complete examples on how to create, deploy and query Amp-powered subgraphs are available in a separate repository: +https://github.com/edgeandnode/amp-subgraph-examples + +## SQL query requirements + +### Block numbers + +Every SQL query in Amp-powered subgraphs must return the block number for every row. +This is required because subgraphs rely on this information for storing subgraph entities. + +Graph-node will look for block numbers in the following columns: +`_block_num`, `block_num`, `blockNum`, `block`, `block_number`, `blockNumber`. + +Example SQL query: `SELECT _block_num, /* .. other projections .. */ FROM "edgeandnode/ethereum_mainnet".blocks;` + +### Block hashes + +Every SQL query in Amp-powered subgraphs is expected to return the block hash for every row. +This is required because subgraphs rely on this information for storing subgraph entities. + +When a SQL query does not have the block hash projection, graph-node will attempt to get it from the +source tables specified in the subgraph manifest. + +Graph-node will look for block hashes in the following columns: +`hash`, `block_hash`, `blockHash`. + +Example SQL query: `SELECT hash, /* .. other projections .. */ FROM "edgeandnode/ethereum_mainnet".blocks;` + +> [!NOTE] +> If a table does not contain the block hash column, it can be retrieved by joining that table with another that contains the column on the `_block_num` column. + +### Block timestamps + +Every SQL query in Amp-powered subgraphs is expected to return the block timestamps for every row. +This is required because subgraphs rely on this information for storing subgraph entities. + +When a SQL query does not have the block timestamps projection, graph-node will attempt to get it from the +source tables specified in the subgraph manifest. + +Graph-node will look for block timestamps in the following columns: +`timestamp`, `block_timestamp`, `blockTimestamp`. + +Example SQL query: `SELECT timestamp, /* .. other projections .. */ FROM "edgeandnode/ethereum_mainnet".blocks;` + +> [!NOTE] +> If a table does not contain the block timestamp column, it can be retrieved by joining that table with another that contains the column on the `_block_num` column. + +## Type conversions + +Amp core SQL data types are converted intuitively to compatible subgraph entity types. + +## Schema generation + +Amp-powered subgraphs support the generation of GraphQL schemas based on the schemas of SQL queries referenced in the subgraph manifest. +This is useful when indexing entities that do not rely on complex relationships, such as contract events. + +The generated subgraph entities are immutable. + +To enable schema generation, simply remove the `schema` field from the subgraph manifest. + +> [!NOTE] +> For more flexibility and control over the schema, a manually created GraphQL schema is preferred. + +## Aggregations + +Amp-powered subgraphs fully support the subgraph aggregations feature. +This allows having complex aggregations on top of data indexed from the Amp servers. + +For more information on using the powerful subgraph aggregations feature, +refer to the [documentation](https://github.com/graphprotocol/graph-node/blob/master/docs/aggregations.md). + +## Composition + +Amp-powered subgraphs fully support the subgraph composition feature. +This allows applying complex subgraph mappings on top of data indexed from the Amp servers. 
+
+For more information on using the powerful subgraph composition feature,
+refer to the [documentation](https://github.com/graphprotocol/example-composable-subgraph).
+
+## ENV variables
+
+The Amp-powered subgraphs feature introduces the following new ENV variables:
+
+- `GRAPH_AMP_FLIGHT_SERVICE_ADDRESS` – The address of the Amp Flight gRPC service. _Defaults to `None`, which disables support for Amp-powered subgraphs._
+- `GRAPH_AMP_FLIGHT_SERVICE_TOKEN` – Token used to authenticate Amp Flight gRPC service requests. _Defaults to `None`, which disables authentication._
+- `GRAPH_AMP_MAX_BUFFER_SIZE` – Maximum number of response batches to buffer in memory per stream for each SQL query. _Defaults to `1,000`._
+- `GRAPH_AMP_MAX_BLOCK_RANGE` – Maximum number of blocks to request per stream for each SQL query. _Defaults to `2,000,000`._
+- `GRAPH_AMP_QUERY_RETRY_MIN_DELAY_SECONDS` – Minimum time to wait before retrying a failed SQL query to the Amp server. _Defaults to `1` second._
+- `GRAPH_AMP_QUERY_RETRY_MAX_DELAY_SECONDS` – Maximum time to wait before retrying a failed SQL query to the Amp server. _Defaults to `600` seconds._
+
+## Metrics
+
+The Amp-powered subgraphs feature introduces the following new metrics:
+
+- `amp_deployment_status` – Indicates the current indexing status of a deployment.
+
+  **Possible values:**
+  - `1` - graph-node is preparing to start indexing;
+  - `2` - deployment is being indexed;
+  - `3` - indexing is stopped by request;
+  - `4` - indexing failed;
+- `amp_deployment_head` – Tracks the most recent block number processed by a deployment.
+- `amp_deployment_target` – Tracks the target block number of a deployment.
+- `amp_deployment_synced` – Indicates whether a deployment has reached the chain head or the end block since it was deployed.
+
+  **Possible values:**
+  - `0` - deployment is not synced;
+  - `1` - deployment is synced;
+- `amp_deployment_indexing_duration_seconds` – Tracks the total duration in seconds of deployment indexing.
+- `amp_deployment_blocks_processed_count` – Tracks the total number of blocks processed by a deployment.
+
+
+Additionally, the `deployment_sync_secs` metric is extended with a new `amp-process` stage and new sections specific to the Amp indexing process.
diff --git a/gnd/Cargo.toml b/gnd/Cargo.toml index 80966f9bfa4..0a1fa91aa18 100644 --- a/gnd/Cargo.toml +++ b/gnd/Cargo.toml @@ -20,6 +20,7 @@ env_logger = "0.11.8" git-testament = "0.2" lazy_static = "1.5.0" tokio = { workspace = true } +tokio-util.workspace = true serde = { workspace = true } # File watching @@ -29,4 +30,4 @@ pq-sys = { version = "0.7.2", features = ["bundled"] } openssl-sys = { version = "0.9.100", features = ["vendored"] } [target.'cfg(unix)'.dependencies] -pgtemp = { git = "https://github.com/graphprotocol/pgtemp", branch = "initdb-args" } \ No newline at end of file +pgtemp = { git = "https://github.com/graphprotocol/pgtemp", branch = "initdb-args" } diff --git a/gnd/src/main.rs b/gnd/src/main.rs index 4c34a59317e..0e8e42238a3 100644 --- a/gnd/src/main.rs +++ b/gnd/src/main.rs @@ -14,6 +14,7 @@ use graph::{ use graph_core::polling_monitor::ipfs_service; use graph_node::{launcher, opt::Opt}; use lazy_static::lazy_static; +use tokio_util::sync::CancellationToken; use gnd::watcher::{deploy_all_subgraphs, parse_manifest_args, watch_subgraphs}; @@ -159,6 +160,7 @@ async fn run_graph_node( opt: Opt, link_resolver: Arc, subgraph_updates_channel: mpsc::Receiver<(DeploymentHash, SubgraphName)>, + cancel_token: CancellationToken, ) -> Result<()> { let env_vars = Arc::new(EnvVars::from_env().context("Failed to load environment variables")?); @@ -184,6 +186,7 @@ async fn run_graph_node( Some(subgraph_updates_channel), prometheus_registry, metrics_registry, + cancel_token, ) .await; Ok(()) @@ -237,6 +240,7 @@ async fn main() -> Result<()> { let database_dir = Path::new(&dev_opt.database_dir); + let cancel_token = shutdown_token(); let logger = logger(true); info!(logger, "Starting Graph Node Dev 1"); @@ -256,7 +260,7 @@ async fn main() -> Result<()> { let logger_clone = logger.clone(); graph::spawn(async move { - let _ = run_graph_node(&logger_clone, opt, file_link_resolver, rx).await; + let _ = run_graph_node(&logger_clone, opt, file_link_resolver, rx, cancel_token).await; }); if let Err(e) = @@ -302,3 +306,39 @@ async fn main() -> Result<()> { #[allow(unreachable_code)] Ok(()) } + +fn shutdown_token() -> CancellationToken { + use tokio::signal; + + let cancel_token = CancellationToken::new(); + let cancel_token_clone = cancel_token.clone(); + + async fn shutdown_signal_handler() { + let ctrl_c = async { + signal::ctrl_c().await.unwrap(); + }; + + #[cfg(unix)] + let terminate = async { + signal::unix::signal(signal::unix::SignalKind::terminate()) + .unwrap() + .recv() + .await; + }; + + #[cfg(not(unix))] + let terminate = std::future::pending::<()>(); + + tokio::select! 
{ + _ = ctrl_c => {}, + _ = terminate => {}, + }; + } + + tokio::spawn(async move { + shutdown_signal_handler().await; + cancel_token_clone.cancel(); + }); + + cancel_token +} diff --git a/graph/Cargo.toml b/graph/Cargo.toml index 44e004be00c..914986b3b8b 100644 --- a/graph/Cargo.toml +++ b/graph/Cargo.toml @@ -80,7 +80,7 @@ tokio-stream = { version = "0.1.15", features = ["sync"] } tokio-retry = "0.3.0" toml = "0.9.7" url = "2.5.7" -prometheus = "0.14.0" +prometheus.workspace = true priority-queue = "2.6.0" tonic = { workspace = true } prost = { workspace = true } @@ -103,6 +103,16 @@ serde_plain = "1.0.2" csv = "1.3.1" object_store = { version = "0.12.3", features = ["gcp"] } +# Dependencies related to Amp subgraphs +ahash.workspace = true +alloy.workspace = true +arrow-flight.workspace = true +arrow.workspace = true +half.workspace = true +lazy-regex.workspace = true +sqlparser-latest.workspace = true +tokio-util.workspace = true + [dev-dependencies] clap.workspace = true maplit = "1.0.2" diff --git a/graph/src/amp/client/flight_client.rs b/graph/src/amp/client/flight_client.rs new file mode 100644 index 00000000000..588f1a97762 --- /dev/null +++ b/graph/src/amp/client/flight_client.rs @@ -0,0 +1,383 @@ +use std::{collections::HashMap, ops::RangeInclusive, time::Duration}; + +use ahash::RandomState; +use alloy::primitives::{BlockHash, BlockNumber}; +use arrow::{datatypes::Schema, error::ArrowError}; +use arrow_flight::{ + decode::DecodedPayload, error::FlightError, flight_service_client::FlightServiceClient, + sql::client::FlightSqlServiceClient, +}; +use async_stream::try_stream; +use bytes::Bytes; +use futures03::{future::BoxFuture, stream::BoxStream, StreamExt}; +use http::Uri; +use serde::{Deserialize, Serialize}; +use slog::{debug, trace, Logger}; +use thiserror::Error; +use tonic::transport::{Channel, ClientTlsConfig, Endpoint}; + +use crate::{ + amp::{ + client::{ + Client, LatestBlockBeforeReorg, RequestMetadata, ResponseBatch, ResumeStreamingQuery, + }, + error, + log::{one_line, Logger as _}, + }, + prelude::CheapClone, +}; + +/// A client for the Amp Flight gRPC service. +/// +/// This client connects to an Amp server and executes SQL queries +/// using the Apache Arrow Flight protocol. +pub struct FlightClient { + channel: Channel, + auth_token: Option, +} + +impl FlightClient { + /// Creates a new Amp client connected to the specified Amp Flight service address. + pub async fn new(addr: Uri) -> Result { + let is_https = addr.scheme() == Some(&http::uri::Scheme::HTTPS); + let mut endpoint = Endpoint::from(addr) + .tcp_keepalive(Some(Duration::from_secs(30))) + .keep_alive_while_idle(true) + .http2_adaptive_window(true) + .initial_connection_window_size(Some(32 * 1024 * 1024)) + .initial_stream_window_size(Some(16 * 1024 * 1024)) + .connect_timeout(Duration::from_secs(10)); + + if is_https { + let mut tls_config = ClientTlsConfig::new(); + tls_config = tls_config.with_native_roots(); + + endpoint = endpoint.tls_config(tls_config).unwrap(); + } + + Ok(Self { + channel: endpoint.connect().await.map_err(Error::Connection)?, + auth_token: None, + }) + } + + /// Sets the authentication token for requests to the Amp server. 
+ pub fn set_auth_token(&mut self, auth_token: impl Into) { + self.auth_token = Some(auth_token.into()); + } + + fn raw_client(&self) -> FlightSqlServiceClient { + let channel = self.channel.cheap_clone(); + let client = FlightServiceClient::new(channel) + .max_encoding_message_size(256 * 1024 * 1024) + .max_decoding_message_size(256 * 1024 * 1024); + + let mut client = FlightSqlServiceClient::new_from_inner(client); + if let Some(auth_token) = &self.auth_token { + client.set_token(auth_token.clone()); + } + + client + } +} + +impl Client for FlightClient { + type Error = Error; + + fn schema( + &self, + logger: &Logger, + query: impl ToString, + ) -> BoxFuture<'static, Result> { + let logger = logger.component("AmpFlightClient"); + let mut raw_client = self.raw_client(); + let query = query.to_string(); + + Box::pin(async move { + const TXN_ID: Option = None; + + debug!(logger, "Executing SQL query"; + "query" => &*one_line(&query) + ); + + let flight_info = raw_client + .execute(query, TXN_ID) + .await + .map_err(Error::Service)?; + + flight_info.try_decode_schema().map_err(Error::Service) + }) + } + + fn query( + &self, + logger: &Logger, + query: impl ToString, + request_metadata: Option, + ) -> BoxStream<'static, Result> { + let query = query.to_string(); + + // Generates a hash from the SQL query for log correlation. + // The hash allows connecting related logs without including the full SQL query in every log message. + // Constant seeds ensure consistent hashes for the same query. + let hasher = RandomState::with_seeds(0, 0, 0, 0); + + let logger = logger + .component("AmpFlightClient") + .new(slog::o!("query_hash" => hasher.hash_one(&query))); + + let mut raw_client = self.raw_client(); + let mut prev_block_ranges: Vec = Vec::new(); + + if let Some(request_metadata) = request_metadata { + let RequestMetadata { + resume_streaming_query, + } = request_metadata; + + if let Some(resume_streaming_query) = resume_streaming_query { + prev_block_ranges = resume_streaming_query + .iter() + .cloned() + .map(Into::into) + .collect(); + + let metadata = serialize_resume_streaming_query(resume_streaming_query); + debug!(logger, "Setting request metadata"; + "amp-resume" => &metadata + ); + + raw_client.set_header("amp-resume", metadata.clone()); + + // TODO: Remove when the Amp server updates to the latest version + raw_client.set_header("nozzle-resume", metadata); + } + } + + try_stream! 
{ + const TXN_ID: Option = None; + + debug!(logger, "Executing SQL query"; + "query" => &*one_line(&query) + ); + + let flight_info = raw_client + .execute(query, TXN_ID) + .await + .map_err(Error::Service)?; + + for (endpoint_index, endpoint) in flight_info.endpoint.into_iter().enumerate() { + let Some(ticket) = endpoint.ticket else { + continue; + }; + + let mut stream = raw_client.do_get(ticket).await.map_err(Error::Service)?.into_inner(); + let mut batch_index = 0u32; + let mut prev_block_ranges = prev_block_ranges.clone(); + + while let Some(batch_result) = stream.next().await { + let flight_data = batch_result.map_err(Error::Stream)?; + let app_metadata = flight_data.inner.app_metadata; + let payload = flight_data.payload; + + let record_batch = match payload { + DecodedPayload::None => { + trace!(logger, "Received empty data"; + "endpoint_index" => endpoint_index + ); + continue + }, + DecodedPayload::Schema(_) => { + trace!(logger, "Received schema only"; + "endpoint_index" => endpoint_index + ); + continue + } + DecodedPayload::RecordBatch(record_batch) => record_batch, + }; + let block_ranges = Metadata::parse(&app_metadata)?.ranges; + + trace!(logger, "Received a new record batch"; + "endpoint_index" => endpoint_index, + "batch_index" => batch_index, + "num_rows" => record_batch.num_rows(), + "memory_size_bytes" => record_batch.get_array_memory_size(), + "block_ranges" => ?block_ranges + ); + + if let Some(reorg) = detect_reorg(&block_ranges, &prev_block_ranges) { + yield ResponseBatch::Reorg(reorg); + } + + yield ResponseBatch::Batch { data: record_batch }; + + batch_index += 1; + prev_block_ranges = block_ranges; + } + + debug!(logger, "Query execution completed successfully"; + "batch_count" => batch_index + ); + } + } + .boxed() + } +} + +#[derive(Debug, Error)] +pub enum Error { + #[error("invalid metadata: {0:#}")] + InvalidMetadata(#[source] anyhow::Error), + + #[error("connection failed: {0:#}")] + Connection(#[source] tonic::transport::Error), + + #[error("service failed: {0:#}")] + Service(#[source] ArrowError), + + #[error("stream failed: {0:#}")] + Stream(#[source] FlightError), +} + +impl error::IsDeterministic for Error { + fn is_deterministic(&self) -> bool { + let msg = match self { + Self::InvalidMetadata(_) => return true, + Self::Connection(_) => return false, + Self::Service(e) => e.to_string(), + Self::Stream(_) => return false, + }; + + static DETERMINISTIC_ERROR_PATTERNS: &[&str] = &[ + // Example SQL query: SELECT; + r#"code: InvalidArgument, message: ""#, + // Example SQL query: SELECT * FROM invalid_dataset; + // SELECT * FROM valid_dataset.invalid_table; + r#"code: Internal, message: "error creating planning context: "#, + // Example SQL query: SELECT invalid_column FROM valid_dataset.valid_table; + r#"code: Internal, message: "planning error: "#, + ]; + + for &pattern in DETERMINISTIC_ERROR_PATTERNS { + if msg.contains(pattern) { + return true; + } + } + + false + } +} + +/// Metadata received with every record batch. +#[derive(Debug, Clone, Deserialize)] +struct Metadata { + /// Block ranges processed by the Amp server to produce the record batch. + ranges: Vec, +} + +impl Metadata { + /// Parses and returns the metadata. + fn parse(app_metadata: &[u8]) -> Result { + if app_metadata.is_empty() { + return Ok(Self { ranges: Vec::new() }); + } + + serde_json::from_slice::(app_metadata).map_err(|e| Error::InvalidMetadata(e.into())) + } +} + +/// Block range processed by the Amp server to produce a record batch. 
+#[derive(Debug, Clone, PartialEq, Eq, Deserialize)] +struct BlockRange { + /// Network that contains the source data for the dataset. + network: String, + + /// Block numbers processed. + numbers: RangeInclusive, + + /// Hash of the last block in the block range. + hash: BlockHash, + + /// Hash of the parent block of the first block in the block range. + prev_hash: Option, +} + +impl BlockRange { + /// Returns the first block number in the range. + fn start(&self) -> BlockNumber { + *self.numbers.start() + } + + /// Returns the last block number in the range. + fn end(&self) -> BlockNumber { + *self.numbers.end() + } +} + +impl From for BlockRange { + fn from(resume: ResumeStreamingQuery) -> Self { + Self { + network: resume.network, + numbers: resume.block_number..=resume.block_number, + hash: resume.block_hash, + prev_hash: None, + } + } +} + +/// Serializes the information required to resume a streaming SQL query to JSON. +fn serialize_resume_streaming_query(resume_streaming_query: Vec) -> String { + #[derive(Serialize)] + struct Block { + number: BlockNumber, + hash: BlockHash, + } + + let mapping: HashMap = resume_streaming_query + .into_iter() + .map( + |ResumeStreamingQuery { + network, + block_number: number, + block_hash: hash, + }| { (network, Block { number, hash }) }, + ) + .collect(); + + serde_json::to_string(&mapping).unwrap() +} + +/// Detects whether a reorg occurred during query execution. +/// +/// Compares current block ranges with block ranges from the previous record batch +/// to detect non-incremental batches. When a non-incremental batch is detected, +/// returns the block number and hash of the parent block of the first block +/// after reorg for every processed network. +/// +/// Returns `None` when no reorgs are detected. +fn detect_reorg( + block_ranges: &[BlockRange], + prev_block_ranges: &[BlockRange], +) -> Option> { + Some( + block_ranges + .iter() + .filter_map(|block_range| { + let prev_block_range = prev_block_ranges + .iter() + .find(|prev_block_range| prev_block_range.network == block_range.network)?; + + if block_range != prev_block_range && block_range.start() <= prev_block_range.end() + { + return Some(LatestBlockBeforeReorg { + network: block_range.network.clone(), + block_number: block_range.start().checked_sub(1), + block_hash: block_range.prev_hash, + }); + } + + None + }) + .collect::>(), + ) + .filter(|v| !v.is_empty()) +} diff --git a/graph/src/amp/client/mod.rs b/graph/src/amp/client/mod.rs new file mode 100644 index 00000000000..34999da03fa --- /dev/null +++ b/graph/src/amp/client/mod.rs @@ -0,0 +1,83 @@ +pub mod flight_client; + +use std::error::Error; + +use alloy::primitives::{BlockHash, BlockNumber}; +use arrow::{array::RecordBatch, datatypes::Schema}; +use futures03::{future::BoxFuture, stream::BoxStream}; +use slog::Logger; + +use crate::amp::error; + +/// Client for connecting to Amp core and executing SQL queries. +pub trait Client { + type Error: Error + error::IsDeterministic + Send + Sync + 'static; + + /// Executes a SQL query and returns the corresponding schema. + fn schema( + &self, + logger: &Logger, + query: impl ToString, + ) -> BoxFuture<'static, Result>; + + /// Executes a SQL query and streams the requested data in batches. + fn query( + &self, + logger: &Logger, + query: impl ToString, + request_metadata: Option, + ) -> BoxStream<'static, Result>; +} + +/// Metadata sent to the Amp server with the SQL query. 
+#[derive(Debug, Clone)] +pub struct RequestMetadata { + /// Allows resuming streaming SQL queries from any block. + pub resume_streaming_query: Option>, +} + +/// Resumes a streaming SQL query from the specified block. +#[derive(Debug, Clone)] +pub struct ResumeStreamingQuery { + /// Network that contains the source data for the dataset. + pub network: String, + + /// Block number after which the SQL query should resume. + /// + /// An invalid block number triggers a reorg message. + pub block_number: BlockNumber, + + /// Block hash of the block after which the SQL query should resume. + /// + /// An invalid block hash triggers a reorg message. + pub block_hash: BlockHash, +} + +/// Represents a batch response resulting from query execution on the Amp server. +#[derive(Debug, Clone)] +pub enum ResponseBatch { + /// Contains the batch data received from the Amp server. + Batch { data: RecordBatch }, + + /// Contains the reorg message received from the Amp server. + /// + /// It is received before the record batch that contains the data after the reorg. + Reorg(Vec), +} + +/// Represents the parent block of the first block after the reorg. +#[derive(Debug, Clone)] +pub struct LatestBlockBeforeReorg { + /// Network that contains the source data for the dataset. + pub network: String, + + /// Block number of the parent block of the first block after the reorg. + /// + /// It is `None` when the reorg affects every block in the blockchain. + pub block_number: Option, + + /// Block hash of the parent block of the first block after the reorg. + /// + /// It is `None` when the reorg affects every block in the blockchain. + pub block_hash: Option, +} diff --git a/graph/src/amp/codec/array_decoder.rs b/graph/src/amp/codec/array_decoder.rs new file mode 100644 index 00000000000..e74a777cb12 --- /dev/null +++ b/graph/src/amp/codec/array_decoder.rs @@ -0,0 +1,2084 @@ +use std::{fmt::Display, sync::LazyLock}; + +use alloy::primitives::B256; +use anyhow::{anyhow, Result}; +use arrow::{ + array::{ + timezone::Tz, Array, ArrayAccessor, BinaryArray, BinaryViewArray, BooleanArray, + Decimal128Array, Decimal256Array, FixedSizeBinaryArray, Float16Array, Float32Array, + Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, + LargeStringArray, PrimitiveArray, StringArray, StringViewArray, TimestampMicrosecondArray, + TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array, + UInt32Array, UInt64Array, UInt8Array, + }, + datatypes::ArrowTemporalType, +}; +use chrono::{DateTime, Utc}; + +use super::decoder::Decoder; +use crate::data::store::scalar::{BigDecimal, BigInt}; + +/// Decodes Arrow arrays into subgraph types. +pub struct ArrayDecoder<'a, T: 'static>(&'a T); + +impl<'a, T> ArrayDecoder<'a, T> +where + T: Array + 'static, +{ + /// Creates a new Arrow array decoder. + /// + /// # Errors + /// + /// Returns an error if the `array` cannot be downcasted to type `T`. + /// + /// The returned error is deterministic. + pub fn new(array: &'a dyn Array) -> Result { + Ok(Self(downcast_ref(array)?)) + } +} + +macro_rules! 
check_value { + ($self:ident, $row_index:ident) => { + if $row_index >= $self.0.len() { + return Ok(None); + } + + if $self.0.is_null($row_index) { + return Ok(None); + } + }; +} + +impl<'a, T> ArrayDecoder<'a, T> +where + &'a T: ArrayAccessor, +{ + fn value( + &'a self, + row_index: usize, + mapping: impl FnOnce(<&'a T as ArrayAccessor>::Item) -> Result, + ) -> Result> { + check_value!(self, row_index); + mapping(self.0.value(row_index)).map(Some) + } +} + +impl Decoder> for ArrayDecoder<'_, BooleanArray> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, Ok) + } +} + +impl Decoder> for ArrayDecoder<'_, Int8Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i32) + } +} + +impl Decoder> for ArrayDecoder<'_, Int8Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i64) + } +} + +impl Decoder> for ArrayDecoder<'_, Int8Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| decode_signed_big_int(x.to_le_bytes())) + } +} + +impl Decoder> for ArrayDecoder<'_, Int16Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i32) + } +} + +impl Decoder> for ArrayDecoder<'_, Int16Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i64) + } +} + +impl Decoder> for ArrayDecoder<'_, Int16Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| decode_signed_big_int(x.to_le_bytes())) + } +} + +impl Decoder> for ArrayDecoder<'_, Int32Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i32) + } +} + +impl Decoder> for ArrayDecoder<'_, Int32Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i64) + } +} + +impl Decoder> for ArrayDecoder<'_, Int32Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| decode_signed_big_int(x.to_le_bytes())) + } +} + +impl Decoder> for ArrayDecoder<'_, Int64Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i32) + } +} + +impl Decoder> for ArrayDecoder<'_, Int64Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i64) + } +} + +impl Decoder> for ArrayDecoder<'_, Int64Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| decode_signed_big_int(x.to_le_bytes())) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt8Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i32) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt8Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i64) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt8Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| decode_unsigned_big_int(x.to_le_bytes())) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt16Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i32) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt16Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i64) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt16Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| decode_unsigned_big_int(x.to_le_bytes())) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt32Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i32) + } +} + 
+impl Decoder> for ArrayDecoder<'_, UInt32Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i64) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt32Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| decode_unsigned_big_int(x.to_le_bytes())) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt64Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i32) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt64Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, decode_i64) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt64Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, Ok) + } +} + +impl Decoder> for ArrayDecoder<'_, UInt64Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| decode_unsigned_big_int(x.to_le_bytes())) + } +} + +impl Decoder> for ArrayDecoder<'_, Float16Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |value| Ok(value.to_f32().into())) + } +} + +impl Decoder> for ArrayDecoder<'_, Float32Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |value| Ok(value.into())) + } +} + +impl Decoder> for ArrayDecoder<'_, Float64Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |value| Ok(value.into())) + } +} + +impl Decoder> for ArrayDecoder<'_, Decimal128Array> { + fn decode(&self, row_index: usize) -> Result> { + if self.0.scale() != 0 { + return Err(anyhow!("cannot decode `i32` from a decimal value")); + } + + self.value(row_index, decode_i32) + } +} + +impl Decoder> for ArrayDecoder<'_, Decimal128Array> { + fn decode(&self, row_index: usize) -> Result> { + if self.0.scale() != 0 { + return Err(anyhow!("cannot decode `i64` from a decimal value")); + } + + self.value(row_index, decode_i64) + } +} + +impl Decoder> for ArrayDecoder<'_, Decimal128Array> { + fn decode(&self, row_index: usize) -> Result> { + if self.0.scale() != 0 { + return Err(anyhow!("cannot decode `BigInt` from a decimal value")); + } + + self.value(row_index, |x| decode_signed_big_int(x.to_le_bytes())) + } +} + +impl Decoder> for ArrayDecoder<'_, Decimal128Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| { + let scale = self.0.scale() as i64; + let big_int = decode_signed_big_int(x.to_le_bytes())?; + + Ok(BigDecimal::new(big_int, -scale)) + }) + } +} + +impl Decoder> for ArrayDecoder<'_, Decimal256Array> { + fn decode(&self, row_index: usize) -> Result> { + if self.0.scale() != 0 { + return Err(anyhow!("cannot decode `i32` from a decimal value")); + } + + self.value(row_index, |value| { + let value = value + .to_i128() + .ok_or_else(|| anyhow!("cannot decode `i32` from a larger `i256` value"))?; + + decode_i32(value) + }) + } +} + +impl Decoder> for ArrayDecoder<'_, Decimal256Array> { + fn decode(&self, row_index: usize) -> Result> { + if self.0.scale() != 0 { + return Err(anyhow!("cannot decode `i64` from a decimal value")); + } + + self.value(row_index, |value| { + let value = value + .to_i128() + .ok_or_else(|| anyhow!("cannot decode `i64` from a larger `i256` value"))?; + + decode_i64(value) + }) + } +} + +impl Decoder> for ArrayDecoder<'_, Decimal256Array> { + fn decode(&self, row_index: usize) -> Result> { + if self.0.scale() != 0 { + return Err(anyhow!("cannot decode `BigInt` from a decimal value")); + } + + self.value(row_index, |x| decode_signed_big_int(x.to_le_bytes())) + } +} + 
+impl Decoder> for ArrayDecoder<'_, Decimal256Array> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| { + let scale = self.0.scale() as i64; + let big_int = decode_signed_big_int(x.to_le_bytes())?; + + Ok(BigDecimal::new(big_int, -scale)) + }) + } +} + +impl Decoder> for ArrayDecoder<'_, StringArray> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| Ok(x.to_string())) + } +} + +impl Decoder> for ArrayDecoder<'_, StringArray> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |value| { + value + .parse() + .map_err(|_| anyhow!("failed to parse `BigInt` from a non-numeric string value")) + }) + } +} + +impl Decoder> for ArrayDecoder<'_, StringArray> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |value| { + value.parse().map_err(|_| { + anyhow!("failed to parse `BigDecimal` from a non-numeric string value") + }) + }) + } +} + +impl Decoder> for ArrayDecoder<'_, StringViewArray> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| Ok(x.to_string())) + } +} + +impl Decoder> for ArrayDecoder<'_, StringViewArray> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |value| { + value + .parse() + .map_err(|_| anyhow!("failed to parse `BigInt` from a non-numeric string value")) + }) + } +} + +impl Decoder> for ArrayDecoder<'_, StringViewArray> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |value| { + value.parse().map_err(|_| { + anyhow!("failed to parse `BigDecimal` from a non-numeric string value") + }) + }) + } +} + +impl Decoder> for ArrayDecoder<'_, LargeStringArray> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| Ok(x.to_string())) + } +} + +impl Decoder> for ArrayDecoder<'_, LargeStringArray> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |value| { + value + .parse() + .map_err(|_| anyhow!("failed to parse `BigInt` from a non-numeric string value")) + }) + } +} + +impl Decoder> for ArrayDecoder<'_, LargeStringArray> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |value| { + value.parse().map_err(|_| { + anyhow!("failed to parse `BigDecimal` from a non-numeric string value") + }) + }) + } +} + +impl Decoder>> for ArrayDecoder<'_, BinaryArray> { + fn decode(&self, row_index: usize) -> Result>> { + self.value(row_index, |x| Ok(x.into())) + } +} + +impl Decoder>> for ArrayDecoder<'_, BinaryViewArray> { + fn decode(&self, row_index: usize) -> Result>> { + self.value(row_index, |x| Ok(x.into())) + } +} + +impl Decoder>> for ArrayDecoder<'_, FixedSizeBinaryArray> { + fn decode(&self, row_index: usize) -> Result>> { + self.value(row_index, |x| Ok(x.into())) + } +} + +impl Decoder> for ArrayDecoder<'_, FixedSizeBinaryArray> { + fn decode(&self, row_index: usize) -> Result> { + self.value(row_index, |x| { + B256::try_from(x) + .map_err(|_| anyhow!("failed to convert '{}' to 'B256'", hex::encode(x))) + }) + } +} + +impl Decoder>> for ArrayDecoder<'_, LargeBinaryArray> { + fn decode(&self, row_index: usize) -> Result>> { + self.value(row_index, |x| Ok(x.into())) + } +} + +impl Decoder>> for ArrayDecoder<'_, TimestampSecondArray> { + fn decode(&self, row_index: usize) -> Result>> { + check_value!(self, row_index); + decode_timestamp(self.0, row_index).map(Some) + } +} + +impl Decoder>> for ArrayDecoder<'_, TimestampMillisecondArray> { + fn decode(&self, row_index: usize) -> Result>> { + check_value!(self, row_index); + 
decode_timestamp(self.0, row_index).map(Some) + } +} + +impl Decoder>> for ArrayDecoder<'_, TimestampMicrosecondArray> { + fn decode(&self, row_index: usize) -> Result>> { + check_value!(self, row_index); + decode_timestamp(self.0, row_index).map(Some) + } +} + +impl Decoder>> for ArrayDecoder<'_, TimestampNanosecondArray> { + fn decode(&self, row_index: usize) -> Result>> { + check_value!(self, row_index); + decode_timestamp(self.0, row_index).map(Some) + } +} + +fn downcast_ref<'a, T>(array: &'a dyn Array) -> Result<&'a T> +where + T: Array + 'static, +{ + array + .as_any() + .downcast_ref() + .ok_or_else(|| anyhow!("failed to downcast array")) +} + +fn decode_i32(n: T) -> Result +where + T: TryInto + Copy + Display, +{ + n.try_into() + .map_err(|_| anyhow!("failed to convert '{n}' to 'i32'")) +} + +fn decode_i64(n: T) -> Result +where + T: TryInto + Copy + Display, +{ + n.try_into() + .map_err(|_| anyhow!("failed to convert '{n}' to 'i64'")) +} + +fn decode_signed_big_int(le_bytes: impl AsRef<[u8]>) -> Result { + let le_bytes = le_bytes.as_ref(); + + BigInt::from_signed_bytes_le(le_bytes) + .map_err(|_| anyhow!("failed to convert '{}' to 'BigInt'", hex::encode(le_bytes))) +} + +fn decode_unsigned_big_int(le_bytes: impl AsRef<[u8]>) -> Result { + let le_bytes = le_bytes.as_ref(); + + BigInt::from_unsigned_bytes_le(le_bytes) + .map_err(|_| anyhow!("failed to convert '{}' to 'BigInt'", hex::encode(le_bytes))) +} + +fn decode_timestamp(array: &PrimitiveArray, row_index: usize) -> Result> +where + T: ArrowTemporalType, + i64: From, +{ + static UTC: LazyLock = LazyLock::new(|| "+00:00".parse().unwrap()); + + let Some(timestamp) = array.value_as_datetime_with_tz(row_index, *UTC) else { + return Err(anyhow!("failed to decode timestamp; unknown timezone")); + }; + + Ok(timestamp.to_utc()) +} + +#[cfg(test)] +mod tests { + use arrow::datatypes::i256; + use chrono::TimeZone; + use half::f16; + + use super::super::test_fixtures::*; + use super::*; + + mod boolean_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, BooleanArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("boolean").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(true)); + assert_eq!(decoder.decode(1).unwrap(), Some(false)); + assert_eq!(decoder.decode(2).unwrap(), Some(true)); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("boolean").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod int8_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, Int8Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("int8").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_i32_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i32)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i32)); + assert_eq!(decoder.decode(2).unwrap(), Some(i8::MAX as i32)); + } + + #[test] + fn decode_valid_i64_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i64)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i64)); + assert_eq!(decoder.decode(2).unwrap(), Some(i8::MAX as i64)); + } + + #[test] + fn 
decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigInt::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigInt::from(20))); + assert_eq!(decoder.decode(2).unwrap(), Some(BigInt::from(i8::MAX))); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("int8").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod int16_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, Int16Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("int16").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_i32_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i32)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i32)); + assert_eq!(decoder.decode(2).unwrap(), Some(i16::MAX as i32)); + } + + #[test] + fn decode_valid_i64_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i64)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i64)); + assert_eq!(decoder.decode(2).unwrap(), Some(i16::MAX as i64)); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigInt::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigInt::from(20))); + assert_eq!(decoder.decode(2).unwrap(), Some(BigInt::from(i16::MAX))); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("int16").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod int32_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, Int32Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("int32").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_i32_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i32)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i32)); + assert_eq!(decoder.decode(2).unwrap(), Some(i32::MAX)); + } + + #[test] + fn decode_valid_i64_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i64)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i64)); + assert_eq!(decoder.decode(2).unwrap(), Some(i32::MAX as i64)); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigInt::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigInt::from(20))); + assert_eq!(decoder.decode(2).unwrap(), Some(BigInt::from(i32::MAX))); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("int32").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod int64_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, Int64Array>: Decoder>, + { + Box::new( + 
ArrayDecoder::::new(RECORD_BATCH.column_by_name("int64").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_i32_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i32)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i32)); + } + + #[test] + fn fail_to_decode_i32_values_from_larger_values() { + let decoder = decoder::(); + + decoder.decode(2).unwrap_err(); + } + + #[test] + fn decode_valid_i64_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i64)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i64)); + assert_eq!(decoder.decode(2).unwrap(), Some(i64::MAX)); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigInt::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigInt::from(20))); + assert_eq!(decoder.decode(2).unwrap(), Some(BigInt::from(i64::MAX))); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("int64").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod uint8_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, UInt8Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("uint8").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_i32_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i32)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i32)); + assert_eq!(decoder.decode(2).unwrap(), Some(u8::MAX as i32)); + } + + #[test] + fn decode_valid_i64_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i64)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i64)); + assert_eq!(decoder.decode(2).unwrap(), Some(u8::MAX as i64)); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigInt::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigInt::from(20))); + assert_eq!(decoder.decode(2).unwrap(), Some(BigInt::from(u8::MAX))); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("uint8").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod uint16_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, UInt16Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("uint16").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_i32_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i32)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i32)); + assert_eq!(decoder.decode(2).unwrap(), Some(u16::MAX as i32)); + } + + #[test] + fn decode_valid_i64_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i64)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i64)); + assert_eq!(decoder.decode(2).unwrap(), Some(u16::MAX as i64)); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigInt::from(10))); + 
assert_eq!(decoder.decode(1).unwrap(), Some(BigInt::from(20))); + assert_eq!(decoder.decode(2).unwrap(), Some(BigInt::from(u16::MAX))); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("uint16").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod uint32_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, UInt32Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("uint32").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_i32_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i32)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i32)); + } + + #[test] + fn fail_to_decode_i32_values_from_larger_values() { + let decoder = decoder::(); + + decoder.decode(2).unwrap_err(); + } + + #[test] + fn decode_valid_i64_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i64)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i64)); + assert_eq!(decoder.decode(2).unwrap(), Some(u32::MAX as i64)); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigInt::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigInt::from(20))); + assert_eq!(decoder.decode(2).unwrap(), Some(BigInt::from(u32::MAX))); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("uint32").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod uint64_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, UInt64Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("uint64").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_i32_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i32)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i32)); + } + + #[test] + fn fail_to_decode_i32_values_from_larger_values() { + let decoder = decoder::(); + + decoder.decode(2).unwrap_err(); + } + + #[test] + fn decode_valid_i64_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i64)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i64)); + } + + #[test] + fn fail_to_decode_i64_values_from_larger_values() { + let decoder = decoder::(); + + decoder.decode(2).unwrap_err(); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigInt::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigInt::from(20))); + assert_eq!(decoder.decode(2).unwrap(), Some(BigInt::from(u64::MAX))); + } + + #[test] + fn decode_valid_u64_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10u64)); + assert_eq!(decoder.decode(1).unwrap(), Some(20u64)); + assert_eq!(decoder.decode(2).unwrap(), Some(u64::MAX)); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn 
fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("uint64").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod float16_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, Float16Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("float16").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_big_decimal_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigDecimal::from(10.0))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigDecimal::from(20.0))); + assert_eq!( + decoder.decode(2).unwrap(), + Some(BigDecimal::from(f16::MAX.to_f32())) + ); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("float16").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod float32_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, Float32Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("float32").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_big_decimal_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigDecimal::from(10.0))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigDecimal::from(20.0))); + assert_eq!(decoder.decode(2).unwrap(), Some(BigDecimal::from(f32::MAX))); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("float32").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod float64_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, Float64Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("float64").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_big_decimal_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigDecimal::from(10.0))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigDecimal::from(20.0))); + assert_eq!(decoder.decode(2).unwrap(), Some(BigDecimal::from(f64::MAX))); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("float64").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod decimal128_decoder_without_scale { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, Decimal128Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("decimal128").unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn decode_valid_i32_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i32)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i32)); + } + + #[test] + fn fail_to_decode_i32_values_from_larger_values() { + let decoder = decoder::(); + + decoder.decode(2).unwrap_err(); + } + + #[test] + fn decode_valid_i64_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), 
Some(10i64)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i64)); + } + + #[test] + fn fail_to_decode_i64_values_from_larger_values() { + let decoder = decoder::(); + + decoder.decode(2).unwrap_err(); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigInt::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigInt::from(20))); + assert_eq!(decoder.decode(2).unwrap(), Some(BigInt::from(i128::MAX))); + } + + #[test] + fn decode_valid_big_decimal_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigDecimal::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigDecimal::from(20))); + assert_eq!( + decoder.decode(2).unwrap(), + Some(BigDecimal::from(i128::MAX)) + ); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("decimal128").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod decimal128_decoder_with_scale { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, Decimal128Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH + .column_by_name("decimal128_with_scale") + .unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn fail_to_decode_i32_values() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + decoder.decode(2).unwrap_err(); + } + + #[test] + fn fail_to_decode_i64_values() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + decoder.decode(2).unwrap_err(); + } + + #[test] + fn fail_to_decode_big_int_values() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + decoder.decode(2).unwrap_err(); + } + + #[test] + fn decode_valid_big_decimal_values() { + let decoder = decoder::(); + + assert_eq!( + decoder.decode(0).unwrap(), + Some(BigDecimal::new(10.into(), -10)) + ); + assert_eq!( + decoder.decode(1).unwrap(), + Some(BigDecimal::new(20.into(), -10)) + ); + assert_eq!( + decoder.decode(2).unwrap(), + Some(BigDecimal::new(i128::MAX.into(), -10)) + ); + } + } + + mod decimal256_decoder_without_scale { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, Decimal256Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("decimal256").unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn decode_valid_i32_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i32)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i32)); + } + + #[test] + fn fail_to_decode_i32_values_from_larger_values() { + let decoder = decoder::(); + + decoder.decode(2).unwrap_err(); + } + + #[test] + fn decode_valid_i64_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(10i64)); + assert_eq!(decoder.decode(1).unwrap(), Some(20i64)); + } + + #[test] + fn fail_to_decode_i64_values_from_larger_values() { + let decoder = decoder::(); + + decoder.decode(2).unwrap_err(); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigInt::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigInt::from(20))); + assert_eq!( + decoder.decode(2).unwrap(), + 
Some(BigInt::from_signed_bytes_be(&i256::MAX.to_be_bytes()).unwrap()) + ); + } + + #[test] + fn decode_valid_big_decimal_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(BigDecimal::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Some(BigDecimal::from(20))); + assert_eq!( + decoder.decode(2).unwrap(), + Some(BigDecimal::new( + BigInt::from_signed_bytes_be(&i256::MAX.to_be_bytes()).unwrap(), + 0 + )) + ); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("decimal256").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod decimal256_decoder_with_scale { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, Decimal256Array>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH + .column_by_name("decimal256_with_scale") + .unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn fail_to_decode_i32_values() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + decoder.decode(2).unwrap_err(); + } + + #[test] + fn fail_to_decode_i64_values() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + decoder.decode(2).unwrap_err(); + } + + #[test] + fn fail_to_decode_big_int_values() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + decoder.decode(2).unwrap_err(); + } + + #[test] + fn decode_valid_big_decimal_values() { + let decoder = decoder::(); + + assert_eq!( + decoder.decode(0).unwrap(), + Some(BigDecimal::new(10.into(), -10)) + ); + assert_eq!( + decoder.decode(1).unwrap(), + Some(BigDecimal::new(20.into(), -10)) + ); + assert_eq!( + decoder.decode(2).unwrap(), + Some(BigDecimal::new( + BigInt::from_signed_bytes_be(&i256::MAX.to_be_bytes()).unwrap(), + -10 + )) + ); + } + } + + mod utf8_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, StringArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("utf8").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_string_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some("aa".to_string())); + assert_eq!(decoder.decode(1).unwrap(), Some("bb".to_string())); + assert_eq!(decoder.decode(2).unwrap(), Some("30".to_string())); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(2).unwrap(), Some(BigInt::from(30))); + } + + #[test] + fn fail_to_decode_big_int_values_from_non_numeric_values() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + } + + #[test] + fn decode_valid_big_decimal_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(2).unwrap(), Some(BigDecimal::from(30))); + } + + #[test] + fn fail_to_decode_big_decimal_values_from_non_numeric_values() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("utf8").unwrap()) + .map(|_| 
()) + .unwrap_err(); + } + } + + mod utf8_view_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, StringViewArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("utf8_view").unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn decode_valid_string_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some("aa".to_string())); + assert_eq!(decoder.decode(1).unwrap(), Some("bb".to_string())); + assert_eq!(decoder.decode(2).unwrap(), Some("30".to_string())); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(2).unwrap(), Some(BigInt::from(30))); + } + + #[test] + fn fail_to_decode_big_int_values_from_non_numeric_values() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + } + + #[test] + fn decode_valid_big_decimal_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(2).unwrap(), Some(BigDecimal::from(30))); + } + + #[test] + fn fail_to_decode_big_decimal_values_from_non_numeric_values() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("utf8_view").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod large_utf8_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, LargeStringArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("large_utf8").unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn decode_valid_string_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some("aa".to_string())); + assert_eq!(decoder.decode(1).unwrap(), Some("bb".to_string())); + assert_eq!(decoder.decode(2).unwrap(), Some("30".to_string())); + } + + #[test] + fn decode_valid_big_int_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(2).unwrap(), Some(BigInt::from(30))); + } + + #[test] + fn fail_to_decode_big_int_values_from_non_numeric_values() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + } + + #[test] + fn decode_valid_big_decimal_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(2).unwrap(), Some(BigDecimal::from(30))); + } + + #[test] + fn fail_to_decode_big_decimal_values_from_non_numeric_values() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("large_utf8").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod binary_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, BinaryArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new(RECORD_BATCH.column_by_name("binary").unwrap()) + .unwrap(), + ) + } + + #[test] + fn decode_valid_binary_values() { + let decoder = decoder::>(); + + assert_eq!(decoder.decode(0).unwrap(), Some((b"aa".as_slice()).into())); + assert_eq!(decoder.decode(1).unwrap(), 
Some((b"bb".as_slice()).into())); + assert_eq!(decoder.decode(2).unwrap(), Some((b"cc".as_slice()).into())); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::>(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("binary").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod binary_view_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, BinaryViewArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("binary_view").unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn decode_valid_binary_values() { + let decoder = decoder::>(); + + assert_eq!(decoder.decode(0).unwrap(), Some((b"aa".as_slice()).into())); + assert_eq!(decoder.decode(1).unwrap(), Some((b"bb".as_slice()).into())); + assert_eq!(decoder.decode(2).unwrap(), Some((b"cc".as_slice()).into())); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::>(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("binary_view").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod fixed_size_binary_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, FixedSizeBinaryArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("fixed_size_binary").unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn decode_valid_binary_values() { + let decoder = decoder::>(); + + assert_eq!(decoder.decode(0).unwrap(), Some((b"aa".as_slice()).into())); + assert_eq!(decoder.decode(1).unwrap(), Some((b"bb".as_slice()).into())); + assert_eq!(decoder.decode(2).unwrap(), Some((b"cc".as_slice()).into())); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::>(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_decode_b256_values_from_invalid_binary_size() { + let decoder = decoder::(); + + decoder.decode(0).unwrap_err(); + decoder.decode(1).unwrap_err(); + decoder.decode(2).unwrap_err(); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("fixed_size_binary").unwrap(), + ) + .map(|_| ()) + .unwrap_err(); + } + } + + mod fixed_size_binary_32_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, FixedSizeBinaryArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("fixed_size_binary_32").unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn decode_valid_b256_values() { + let decoder = decoder::(); + + assert_eq!(decoder.decode(0).unwrap(), Some(B256::from([10u8; 32]))); + assert_eq!(decoder.decode(1).unwrap(), Some(B256::from([20u8; 32]))); + assert_eq!(decoder.decode(2).unwrap(), Some(B256::from([30u8; 32]))); + } + } + + mod large_binary_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, LargeBinaryArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("large_binary").unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn decode_valid_binary_values() { + let decoder = decoder::>(); + + assert_eq!(decoder.decode(0).unwrap(), Some((b"aa".as_slice()).into())); + assert_eq!(decoder.decode(1).unwrap(), 
Some((b"bb".as_slice()).into())); + assert_eq!(decoder.decode(2).unwrap(), Some((b"cc".as_slice()).into())); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::>(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new(RECORD_BATCH.column_by_name("large_binary").unwrap()) + .map(|_| ()) + .unwrap_err(); + } + } + + mod timestamp_second_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, TimestampSecondArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("timestamp_second").unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn decode_valid_values() { + let decoder = decoder::>(); + + assert_eq!( + decoder.decode(0).unwrap(), + Some(Utc.with_ymd_and_hms(2020, 1, 1, 0, 0, 0).unwrap()) + ); + assert_eq!( + decoder.decode(1).unwrap(), + Some(Utc.with_ymd_and_hms(2020, 10, 10, 10, 10, 10).unwrap()) + ); + assert_eq!( + decoder.decode(2).unwrap(), + Some(Utc.with_ymd_and_hms(2020, 12, 31, 23, 59, 59).unwrap()) + ); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::>(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("timestamp_second").unwrap(), + ) + .map(|_| ()) + .unwrap_err(); + } + } + + mod timestamp_millisecond_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, TimestampMillisecondArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH + .column_by_name("timestamp_millisecond") + .unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn decode_valid_values() { + let decoder = decoder::>(); + + assert_eq!( + decoder.decode(0).unwrap(), + Some(Utc.with_ymd_and_hms(2020, 1, 1, 0, 0, 0).unwrap()) + ); + assert_eq!( + decoder.decode(1).unwrap(), + Some(Utc.with_ymd_and_hms(2020, 10, 10, 10, 10, 10).unwrap()) + ); + assert_eq!( + decoder.decode(2).unwrap(), + Some(Utc.with_ymd_and_hms(2020, 12, 31, 23, 59, 59).unwrap()) + ); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::>(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new( + RECORD_BATCH + .column_by_name("timestamp_millisecond") + .unwrap(), + ) + .map(|_| ()) + .unwrap_err(); + } + } + + mod timestamp_microsecond_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, TimestampMicrosecondArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH + .column_by_name("timestamp_microsecond") + .unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn decode_valid_values() { + let decoder = decoder::>(); + + assert_eq!( + decoder.decode(0).unwrap(), + Some(Utc.with_ymd_and_hms(2020, 1, 1, 0, 0, 0).unwrap()) + ); + assert_eq!( + decoder.decode(1).unwrap(), + Some(Utc.with_ymd_and_hms(2020, 10, 10, 10, 10, 10).unwrap()) + ); + assert_eq!( + decoder.decode(2).unwrap(), + Some(Utc.with_ymd_and_hms(2020, 12, 31, 23, 59, 59).unwrap()) + ); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::>(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new( + 
RECORD_BATCH + .column_by_name("timestamp_microsecond") + .unwrap(), + ) + .map(|_| ()) + .unwrap_err(); + } + } + + mod timestamp_nanosecond_decoder { + use super::*; + + fn decoder() -> Box>> + where + ArrayDecoder<'static, TimestampNanosecondArray>: Decoder>, + { + Box::new( + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("timestamp_nanosecond").unwrap(), + ) + .unwrap(), + ) + } + + #[test] + fn decode_valid_values() { + let decoder = decoder::>(); + + assert_eq!( + decoder.decode(0).unwrap(), + Some(Utc.with_ymd_and_hms(2020, 1, 1, 0, 0, 0).unwrap()) + ); + assert_eq!( + decoder.decode(1).unwrap(), + Some(Utc.with_ymd_and_hms(2020, 10, 10, 10, 10, 10).unwrap()) + ); + assert_eq!( + decoder.decode(2).unwrap(), + Some(Utc.with_ymd_and_hms(2020, 12, 31, 23, 59, 59).unwrap()) + ); + } + + #[test] + fn handle_missing_values() { + let decoder = decoder::>(); + + assert!(decoder.decode(3).unwrap().is_none()); + assert!(decoder.decode(4).unwrap().is_none()); + } + + #[test] + fn fail_to_create_decoder_with_invalid_type() { + ArrayDecoder::::new( + RECORD_BATCH.column_by_name("timestamp_nanosecond").unwrap(), + ) + .map(|_| ()) + .unwrap_err(); + } + } +} diff --git a/graph/src/amp/codec/decoder.rs b/graph/src/amp/codec/decoder.rs new file mode 100644 index 00000000000..9b5c340e891 --- /dev/null +++ b/graph/src/amp/codec/decoder.rs @@ -0,0 +1,29 @@ +use anyhow::Result; + +/// Decodes Arrow data at specific row indices into subgraph types. +/// +/// This trait provides a common interface for converting Arrow format data into +/// custom types. Implementations handle the specifics of extracting data from +/// Arrow arrays and constructing the target type `T`. +pub trait Decoder { + /// Decodes and returns the value at the `row_index`. + /// + /// # Errors + /// + /// Returns an error if: + /// - The data cannot be converted to type `T` + /// - The underlying Arrow data is invalid or corrupted + /// + /// The returned error is deterministic. + fn decode(&self, row_index: usize) -> Result; +} + +/// Forwards decoding operations through boxed trait objects. +/// +/// This implementation enables using `Box>` as a decoder, +/// delegating to the underlying implementation. +impl Decoder for Box + '_> { + fn decode(&self, row_index: usize) -> Result { + (**self).decode(row_index) + } +} diff --git a/graph/src/amp/codec/list_decoder.rs b/graph/src/amp/codec/list_decoder.rs new file mode 100644 index 00000000000..4c0e2a44504 --- /dev/null +++ b/graph/src/amp/codec/list_decoder.rs @@ -0,0 +1,88 @@ +use anyhow::Result; + +use super::decoder::Decoder; + +/// Decodes Arrow lists to vectors of decoded values. +pub(super) struct ListDecoder<'a, T> { + decoder: T, + offsets: ArrayOffsets<'a>, +} + +/// Contains row index offsets used to determine how many values to decode from an Arrow list. +pub(super) enum ArrayOffsets<'a> { + Small(&'a [i32]), + Large(&'a [i64]), + Fixed(i32), +} + +impl<'a, T> ListDecoder<'a, T> { + /// Creates a new Arrow list decoder with provided `offsets`. 
+ pub(super) fn new(decoder: T, offsets: ArrayOffsets<'a>) -> Self { + Self { decoder, offsets } + } +} + +impl<'a, T, V> Decoder>> for ListDecoder<'a, T> +where + T: Decoder, +{ + fn decode(&self, row_index: usize) -> Result>> { + let Some(range) = self.offsets.range(row_index) else { + return Ok(None); + }; + + let values = range + .map(|row_index| self.decoder.decode(row_index)) + .collect::, _>>()?; + + if values.is_empty() { + return Ok(None); + } + + Ok(Some(values)) + } +} + +impl<'a> ArrayOffsets<'a> { + /// Returns row indices belonging to a list at `row_index`. + fn range(&self, row_index: usize) -> Option> { + match self { + Self::Small(offsets) => { + let start = *offsets.get(row_index)? as usize; + let end = *offsets.get(row_index + 1)? as usize; + + Some(start..end) + } + Self::Large(offsets) => { + let start = *offsets.get(row_index)? as usize; + let end = *offsets.get(row_index + 1)? as usize; + + Some(start..end) + } + Self::Fixed(value_length) => { + let start = *value_length as usize * row_index; + let end = *value_length as usize * (row_index + 1); + + Some(start..end) + } + } + } +} + +impl<'a> From<&'a [i32]> for ArrayOffsets<'a> { + fn from(offsets: &'a [i32]) -> Self { + Self::Small(offsets) + } +} + +impl<'a> From<&'a [i64]> for ArrayOffsets<'a> { + fn from(offsets: &'a [i64]) -> Self { + Self::Large(offsets) + } +} + +impl From for ArrayOffsets<'static> { + fn from(value_length: i32) -> Self { + Self::Fixed(value_length) + } +} diff --git a/graph/src/amp/codec/mapping_decoder.rs b/graph/src/amp/codec/mapping_decoder.rs new file mode 100644 index 00000000000..b0c85e9d2e6 --- /dev/null +++ b/graph/src/amp/codec/mapping_decoder.rs @@ -0,0 +1,32 @@ +use anyhow::Result; + +use super::decoder::Decoder; + +/// Decodes Arrow arrays and maps the decoded values to a different type. +pub(super) struct MappingDecoder { + decoder: T, + mapping: Box V + 'static>, +} + +impl MappingDecoder { + /// Creates a new decoder that wraps the `decoder`. + /// + /// The `mapping` function transforms decoded values from type `U` to type `V`. + pub(super) fn new(decoder: T, mapping: impl Fn(U) -> V + 'static) -> Self { + Self { + decoder, + mapping: Box::new(mapping), + } + } +} + +impl Decoder for MappingDecoder +where + T: Decoder, +{ + fn decode(&self, row_index: usize) -> Result { + let value = self.decoder.decode(row_index)?; + + Ok((&self.mapping)(value)) + } +} diff --git a/graph/src/amp/codec/mod.rs b/graph/src/amp/codec/mod.rs new file mode 100644 index 00000000000..b642d0377c9 --- /dev/null +++ b/graph/src/amp/codec/mod.rs @@ -0,0 +1,511 @@ +mod array_decoder; +mod decoder; +mod list_decoder; +mod mapping_decoder; +mod name_cache; +mod value_decoder; + +#[cfg(test)] +mod test_fixtures; + +pub mod utils; + +use std::{ + collections::{BTreeMap, HashMap}, + sync::Arc, +}; + +use anyhow::{anyhow, bail, Context, Result}; +use arrow::array::{Array, RecordBatch}; + +use self::{list_decoder::ListDecoder, mapping_decoder::MappingDecoder, name_cache::NameCache}; +use crate::{ + data::{ + graphql::TypeExt, + store::{Id, IdType, Value}, + value::Word, + }, + schema::{EntityKey, EntityType, Field, InputSchema}, +}; + +pub use self::{array_decoder::ArrayDecoder, decoder::Decoder}; + +/// Handles decoding of record batches to subgraph entities. +pub struct Codec { + input_schema: InputSchema, + name_cache: NameCache, +} + +/// Contains the entities decoded from a record batch. +pub struct DecodeOutput { + /// The type of entities in this batch. 
+ pub entity_type: EntityType, + + /// The type of the ID of entities in this batch. + pub id_type: IdType, + + /// A list of decoded entities of the same type. + pub decoded_entities: Vec, +} + +/// Contains a single entity decoded from a record batch. +pub struct DecodedEntity { + /// The unique ID of the entity. + /// + /// When set to `None`, the ID is expected to be auto-generated before a new entity is persisted. + pub key: Option, + + /// A list of entity field names and their values. + /// + /// This list could contain a subset of fields of an entity. + pub entity_data: Vec<(Word, Value)>, +} + +impl Codec { + /// Creates a new decoder for the `input_schema`. + pub fn new(input_schema: InputSchema) -> Self { + let name_cache = NameCache::new(); + + Self { + input_schema, + name_cache, + } + } + + /// Decodes a `record_batch` according to the schema of the entity with name `entity_name`. + /// + /// # Errors + /// + /// Returns an error if `record_batch` is not compatible with the schema of the entity with name `entity_name`. + /// + /// The returned error is deterministic. + pub fn decode(&mut self, record_batch: RecordBatch, entity_name: &str) -> Result { + let entity_type = self.entity_type(entity_name)?; + let id_type = entity_type.id_type()?; + let value_decoders = self.value_decoders(&entity_type, &record_batch)?; + let mut decoded_entities = Vec::with_capacity(record_batch.num_rows()); + + for i in 0..record_batch.num_rows() { + let err_ctx = |s: &str| format!("field '{s}' at row {i}"); + let mut entity_id: Option = None; + let mut entity_data = Vec::with_capacity(value_decoders.len()); + + for (&field_name, value_decoder) in &value_decoders { + let value = value_decoder + .decode(i) + .with_context(|| err_ctx(field_name))?; + + if field_name.eq_ignore_ascii_case("id") { + entity_id = Some(value.clone()); + } + + entity_data.push((Word::from(field_name), value)); + } + + let entity_key = entity_id + .map(Id::try_from) + .transpose() + .with_context(|| err_ctx("id"))? + .map(|entity_id| entity_type.key(entity_id)); + + decoded_entities.push(DecodedEntity { + key: entity_key, + entity_data, + }); + } + + drop(value_decoders); + + Ok(DecodeOutput { + entity_type, + id_type, + decoded_entities, + }) + } + + /// Returns the type of the entity with name `entity_name`. + /// + /// # Errors + /// + /// Returns an error if: + /// - There is no entity with name `entity_name` + /// - The entity is not an object + /// - The entity is a POI entity + /// + /// The returned error is deterministic. + fn entity_type(&self, entity_name: &str) -> Result { + let entity_type = self + .input_schema + .entity_type(entity_name) + .context("entity not found")?; + + if !entity_type.is_object_type() { + return Err(anyhow!("entity is not an object")); + } + + if entity_type.is_poi() { + return Err(anyhow!("entity is POI entity")); + } + + Ok(entity_type) + } + + /// Creates and returns value decoders for the fields of the entity with name `entity_name`. + /// + /// # Errors + /// + /// Returns an error if a decoder could not be created for a required field. + /// + /// The returned error is deterministic. 
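+    ///
+    /// Column names and entity field names are both normalized through the name cache
+    /// before matching, so the pairing of record-batch columns with entity fields is
+    /// effectively case- and underscore-insensitive.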
+ fn value_decoders<'a>( + &mut self, + entity_type: &'a EntityType, + record_batch: &'a RecordBatch, + ) -> Result + 'a>>> { + let object_type = entity_type.object_type().unwrap(); + let columns = record_batch + .schema_ref() + .fields() + .into_iter() + .zip(record_batch.columns()) + .map(|(field, array)| Ok((self.ident(field.name()), array.as_ref()))) + .collect::>>()?; + + let mut value_decoders = BTreeMap::new(); + for field in &object_type.fields { + let Some(value_decoder) = self.value_decoder(field, &columns)? else { + continue; + }; + + value_decoders.insert(field.name.as_str(), value_decoder); + } + + Ok(value_decoders) + } + + /// Creates and returns a value decoder for the `field`. + /// + /// Returns `None` when the `field` does not require a decoder. + /// This happens for derived fields, reserved fields, and when there is no associated + /// Arrow array for a nullable `field` or a `field` that could be auto-generated. + /// + /// # Errors + /// + /// Returns an error if: + /// - There is no associated Arrow array for a required `field` + /// - The `field` type is not compatible with the Arrow array + /// + /// The returned error is deterministic. + fn value_decoder<'a>( + &mut self, + field: &'a Field, + columns: &HashMap, &'a dyn Array>, + ) -> Result + 'a>>> { + // VIDs are auto-generated + if field.name.eq_ignore_ascii_case("vid") { + return Ok(None); + } + + // Derived fields are handled automatically + if field.is_derived() { + return Ok(None); + } + + let normalized_name = self.ident(&field.name); + let array = match columns.get(&normalized_name) { + Some(&array) => array, + None => { + // Allow ID auto-generation + if field.name.eq_ignore_ascii_case("id") { + return Ok(None); + } + + // Allow partial entities + if !field.field_type.is_non_null() { + return Ok(None); + } + + bail!("failed to get column for field '{}'", field.name); + } + }; + + let decoder = value_decoder::value_decoder(field.value_type, field.is_list(), array) + .with_context(|| format!("failed to create decoder for field '{}'", field.name))?; + + Ok(Some(decoder)) + } + + fn ident(&mut self, name: impl AsRef) -> Arc { + self.name_cache.ident(name.as_ref()) + } +} + +#[cfg(test)] +mod tests { + use std::sync::LazyLock; + + use arrow::array::{BinaryArray, BooleanArray, Int64Array, Int8Array}; + use arrow::datatypes::{DataType, Field, Schema}; + + use crate::data::subgraph::DeploymentHash; + + use super::*; + + static SCHEMA: LazyLock = LazyLock::new(|| { + InputSchema::parse_latest( + r#" + type Id @entity { + id: Int8! + } + + type BlockNumber @entity { + id: Int8! + blockNumber: BigInt! + } + + type OptionalBlockNumber @entity { + id: Int8! + blockNumber: BigInt + } + + type Block @entity { + id: Int8! + number: Int8! + hash: Bytes! 
+ value: BigInt + } + "#, + DeploymentHash::default(), + ) + .unwrap() + }); + + #[inline] + fn new_codec() -> Codec { + Codec::new(SCHEMA.clone()) + } + + #[test] + fn fail_to_decode_unknown_entity() { + let schema = Schema::new(vec![Field::new("some_field", DataType::Boolean, true)]); + let record_batch = RecordBatch::new_empty(schema.into()); + + let mut codec = new_codec(); + let e = codec + .decode(record_batch, "SomeEntity") + .map(|_| ()) + .unwrap_err(); + + assert!(format!("{e:#}").contains("entity not found")) + } + + #[test] + fn do_not_fail_on_empty_record_batch() { + let schema = Schema::new(vec![Field::new("some_field", DataType::Boolean, true)]); + let record_batch = RecordBatch::new_empty(schema.into()); + + let mut codec = new_codec(); + let decode_output = codec.decode(record_batch, "Id").unwrap(); + + assert!(decode_output.decoded_entities.is_empty()); + } + + #[test] + fn allow_entity_ids_to_be_auto_generated() { + let schema = Schema::new(vec![Field::new("some_field", DataType::Boolean, true)]); + let record_batch = RecordBatch::try_new( + schema.into(), + vec![Arc::new(BooleanArray::from(vec![true, false]))], + ) + .unwrap(); + + let mut codec = new_codec(); + let decode_output = codec.decode(record_batch, "Id").unwrap(); + let decoded_entities = decode_output.decoded_entities; + + assert_eq!(decoded_entities.len(), 2); + + for decoded_entity in decoded_entities { + assert!(decoded_entity.key.is_none()); + assert!(decoded_entity.entity_data.is_empty()); + } + } + + #[test] + fn decode_entity_ids() { + let schema = Schema::new(vec![Field::new("id", DataType::Int8, true)]); + let record_batch = RecordBatch::try_new( + schema.into(), + vec![Arc::new(Int8Array::from(vec![10, 20, 30]))], + ) + .unwrap(); + + let mut codec = new_codec(); + let decode_output = codec.decode(record_batch, "Id").unwrap(); + let decoded_entities = decode_output.decoded_entities; + + assert_eq!(decoded_entities.len(), 3); + + assert_eq!( + decoded_entities[0].key.as_ref().unwrap().entity_id, + Id::Int8(10), + ); + assert_eq!( + &decoded_entities[0].entity_data, + &[(Word::from("id"), Value::Int8(10))], + ); + + assert_eq!( + decoded_entities[1].key.as_ref().unwrap().entity_id, + Id::Int8(20) + ); + assert_eq!( + &decoded_entities[1].entity_data, + &[(Word::from("id"), Value::Int8(20))], + ); + + assert_eq!( + decoded_entities[2].key.as_ref().unwrap().entity_id, + Id::Int8(30) + ); + assert_eq!( + &decoded_entities[2].entity_data, + &[(Word::from("id"), Value::Int8(30))], + ); + } + + #[test] + fn fail_to_decode_entity_when_a_required_field_is_missing() { + let schema = Schema::new(vec![Field::new("some_field", DataType::Int8, true)]); + let record_batch = + RecordBatch::try_new(schema.into(), vec![Arc::new(Int8Array::from(vec![10]))]).unwrap(); + + let mut codec = new_codec(); + let e = codec + .decode(record_batch, "BlockNumber") + .map(|_| ()) + .unwrap_err(); + + assert!(format!("{e:#}").contains("failed to get column for field 'blockNumber'")); + } + + #[test] + fn decode_entity_when_an_optional_field_is_missing() { + let schema = Schema::new(vec![Field::new("some_field", DataType::Int8, true)]); + let record_batch = + RecordBatch::try_new(schema.into(), vec![Arc::new(Int8Array::from(vec![10]))]).unwrap(); + + let mut codec = new_codec(); + let decode_output = codec.decode(record_batch, "OptionalBlockNumber").unwrap(); + let decoded_entitites = decode_output.decoded_entities; + + assert_eq!(decoded_entitites.len(), 1); + assert!(decoded_entitites[0].entity_data.is_empty()); + } + + #[test] + 
fn match_entity_field_name_with_column_name_ignoring_case() { + for column_name in [ + "block_number", + "Block_Number", + "BLOCK_NUMBER", + "blocknumber", + "blockNumber", + "BlockNumber", + "BLOCKNUMBER", + ] { + let schema = Schema::new(vec![Field::new(column_name, DataType::Int8, true)]); + let record_batch = RecordBatch::try_new( + schema.into(), + vec![Arc::new(Int8Array::from(vec![10, 20, 30]))], + ) + .unwrap(); + + let mut codec = new_codec(); + let decode_output = codec.decode(record_batch, "BlockNumber").unwrap(); + let decoded_entitites = decode_output.decoded_entities; + + assert_eq!(decoded_entitites.len(), 3); + + assert_eq!( + &decoded_entitites[0].entity_data, + &[(Word::from("blockNumber"), Value::BigInt(10.into()))] + ); + assert_eq!( + &decoded_entitites[1].entity_data, + &[(Word::from("blockNumber"), Value::BigInt(20.into()))] + ); + assert_eq!( + &decoded_entitites[2].entity_data, + &[(Word::from("blockNumber"), Value::BigInt(30.into()))] + ); + } + } + + #[test] + fn fail_to_decode_entity_when_field_type_and_column_type_are_incompatible() { + let schema = Schema::new(vec![Field::new("block_number", DataType::Boolean, true)]); + let record_batch = RecordBatch::try_new( + schema.into(), + vec![Arc::new(BooleanArray::from(vec![true]))], + ) + .unwrap(); + + let mut codec = new_codec(); + let e = codec + .decode(record_batch, "BlockNumber") + .map(|_| ()) + .unwrap_err(); + + assert!(format!("{e:#}").contains("failed to create decoder for field 'blockNumber'")) + } + + #[test] + fn decode_entities_with_multiple_fields() { + let schema = Schema::new(vec![ + Field::new("number", DataType::Int8, true), + Field::new("hash", DataType::Binary, true), + Field::new("value", DataType::Int64, true), + ]); + let record_batch = RecordBatch::try_new( + schema.into(), + vec![ + Arc::new(Int8Array::from(vec![10, 20, 30])), + Arc::new(BinaryArray::from(vec![b"aa".as_ref(), b"bb", b"cc"])), + Arc::new(Int64Array::from(vec![100, 200, 300])), + ], + ) + .unwrap(); + + let mut codec = new_codec(); + let decode_output = codec.decode(record_batch, "Block").unwrap(); + let decoded_entitites = decode_output.decoded_entities; + + assert_eq!(decoded_entitites.len(), 3); + + assert_eq!( + &decoded_entitites[0].entity_data, + &[ + (Word::from("hash"), Value::Bytes(b"aa".as_ref().into())), + (Word::from("number"), Value::Int8(10)), + (Word::from("value"), Value::BigInt(100.into())) + ] + ); + assert_eq!( + &decoded_entitites[1].entity_data, + &[ + (Word::from("hash"), Value::Bytes(b"bb".as_ref().into())), + (Word::from("number"), Value::Int8(20)), + (Word::from("value"), Value::BigInt(200.into())) + ] + ); + assert_eq!( + &decoded_entitites[2].entity_data, + &[ + (Word::from("hash"), Value::Bytes(b"cc".as_ref().into())), + (Word::from("number"), Value::Int8(30)), + (Word::from("value"), Value::BigInt(300.into())) + ] + ); + } +} diff --git a/graph/src/amp/codec/name_cache.rs b/graph/src/amp/codec/name_cache.rs new file mode 100644 index 00000000000..9ad28f7a3b1 --- /dev/null +++ b/graph/src/amp/codec/name_cache.rs @@ -0,0 +1,34 @@ +use std::{collections::HashMap, sync::Arc}; + +use inflector::Inflector; + +use crate::cheap_clone::CheapClone; + +/// Normalizes and caches identifiers that are used to match Arrow columns and subgraph entity fields. +pub(super) struct NameCache { + cache: HashMap, Arc>, +} + +impl NameCache { + /// Creates a new empty cache. + pub(super) fn new() -> Self { + Self { + cache: HashMap::new(), + } + } + + /// Normalizes and returns the identifier for the given name. 
+ /// + /// If the identifier exists in the cache, returns the cached version. + /// Otherwise, creates a new normalized identifier, caches it, and returns it. + pub(super) fn ident(&mut self, name: &str) -> Arc { + if let Some(ident) = self.cache.get(name) { + return ident.cheap_clone(); + } + + let ident: Arc = name.to_camel_case().to_lowercase().into(); + self.cache.insert(name.into(), ident.cheap_clone()); + + ident + } +} diff --git a/graph/src/amp/codec/test_fixtures.rs b/graph/src/amp/codec/test_fixtures.rs new file mode 100644 index 00000000000..a55001439b2 --- /dev/null +++ b/graph/src/amp/codec/test_fixtures.rs @@ -0,0 +1,364 @@ +use std::sync::{Arc, LazyLock}; + +use arrow::{ + array::{ + BinaryArray, BinaryViewArray, BooleanArray, BooleanBuilder, Decimal128Builder, + Decimal256Builder, FixedSizeBinaryArray, FixedSizeListBuilder, Float16Array, Float32Array, + Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, + LargeListBuilder, LargeListViewBuilder, LargeStringArray, ListBuilder, ListViewBuilder, + RecordBatch, StringArray, StringViewArray, TimestampMicrosecondArray, + TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array, + UInt32Array, UInt64Array, UInt8Array, + }, + datatypes::{ + i256, DataType, Field, Schema, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION, + }, +}; +use chrono::{TimeZone, Utc}; +use half::f16; + +pub static RECORD_BATCH: LazyLock = LazyLock::new(|| { + let record_batches = [ + &BOOLEAN_RECORD_BATCH, + &INT_RECORD_BATCH, + &UINT_RECORD_BATCH, + &DECIMAL_RECORD_BATCH, + &FLOAT_RECORD_BATCH, + &STRING_RECORD_BATCH, + &BINARY_RECORD_BATCH, + &TIMESTAMP_RECORD_BATCH, + ]; + + let schemas = record_batches + .iter() + .map(|record_batch| (*record_batch.schema()).clone()); + + let columns = record_batches + .into_iter() + .map(|record_batch| record_batch.columns()) + .flatten() + .map(|column| column.clone()) + .collect::>(); + + RecordBatch::try_new(Schema::try_merge(schemas).unwrap().into(), columns).unwrap() +}); + +pub static BOOLEAN_RECORD_BATCH: LazyLock = LazyLock::new(|| { + let schema = Schema::new(vec![ + Field::new("boolean", DataType::Boolean, true), + Field::new( + "boolean_list", + DataType::List(Arc::new(Field::new("item", DataType::Boolean, true))), + true, + ), + Field::new( + "boolean_list_view", + DataType::ListView(Arc::new(Field::new("item", DataType::Boolean, true))), + true, + ), + Field::new( + "boolean_fixed_size_list", + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Boolean, true)), 3), + true, + ), + Field::new( + "boolean_large_list", + DataType::LargeList(Arc::new(Field::new("item", DataType::Boolean, true))), + true, + ), + Field::new( + "boolean_large_list_view", + DataType::LargeListView(Arc::new(Field::new("item", DataType::Boolean, true))), + true, + ), + ]); + + let builder = || { + let mut builder = BooleanBuilder::new(); + builder.append_value(true); + builder.append_value(false); + builder.append_value(true); + builder + }; + + RecordBatch::try_new( + schema.into(), + vec![ + Arc::new(BooleanArray::from(vec![true, false, true])), + Arc::new({ + let mut list_builder = ListBuilder::new(builder()); + list_builder.append(true); + list_builder.append(false); + list_builder.append(false); + list_builder.finish() + }), + Arc::new({ + let mut list_builder = ListViewBuilder::new(builder()); + list_builder.append(true); + list_builder.append(false); + list_builder.append(false); + list_builder.finish() + }), + Arc::new({ + let mut list_builder = 
FixedSizeListBuilder::new(builder(), 3); + list_builder.append(true); + list_builder.values().append_null(); + list_builder.values().append_null(); + list_builder.values().append_null(); + list_builder.append(false); + list_builder.values().append_null(); + list_builder.values().append_null(); + list_builder.values().append_null(); + list_builder.append(false); + list_builder.finish() + }), + Arc::new({ + let mut list_builder = LargeListBuilder::new(builder()); + list_builder.append(true); + list_builder.append(false); + list_builder.append(false); + list_builder.finish() + }), + Arc::new({ + let mut list_builder = LargeListViewBuilder::new(builder()); + list_builder.append(true); + list_builder.append(false); + list_builder.append(false); + list_builder.finish() + }), + ], + ) + .unwrap() +}); + +pub static INT_RECORD_BATCH: LazyLock = LazyLock::new(|| { + let schema = Schema::new(vec![ + Field::new("int8", DataType::Int8, true), + Field::new("int16", DataType::Int16, true), + Field::new("int32", DataType::Int32, true), + Field::new("int64", DataType::Int64, true), + ]); + + RecordBatch::try_new( + schema.into(), + vec![ + Arc::new(Int8Array::from(vec![10, 20, i8::MAX])), + Arc::new(Int16Array::from(vec![10, 20, i16::MAX])), + Arc::new(Int32Array::from(vec![10, 20, i32::MAX])), + Arc::new(Int64Array::from(vec![10, 20, i64::MAX])), + ], + ) + .unwrap() +}); + +pub static UINT_RECORD_BATCH: LazyLock = LazyLock::new(|| { + let schema = Schema::new(vec![ + Field::new("uint8", DataType::UInt8, true), + Field::new("uint16", DataType::UInt16, true), + Field::new("uint32", DataType::UInt32, true), + Field::new("uint64", DataType::UInt64, true), + ]); + + RecordBatch::try_new( + schema.into(), + vec![ + Arc::new(UInt8Array::from(vec![10, 20, u8::MAX])), + Arc::new(UInt16Array::from(vec![10, 20, u16::MAX])), + Arc::new(UInt32Array::from(vec![10, 20, u32::MAX])), + Arc::new(UInt64Array::from(vec![10, 20, u64::MAX])), + ], + ) + .unwrap() +}); + +pub static DECIMAL_RECORD_BATCH: LazyLock = LazyLock::new(|| { + let schema = Schema::new(vec![ + Field::new( + "decimal128", + DataType::Decimal128(DECIMAL128_MAX_PRECISION, 0), + true, + ), + Field::new( + "decimal128_with_scale", + DataType::Decimal128(DECIMAL128_MAX_PRECISION, 10), + true, + ), + Field::new( + "decimal256", + DataType::Decimal256(DECIMAL256_MAX_PRECISION, 0), + true, + ), + Field::new( + "decimal256_with_scale", + DataType::Decimal256(DECIMAL256_MAX_PRECISION, 10), + true, + ), + ]); + + let decimal_128_array = |scale: i8| { + let mut builder = Decimal128Builder::new() + .with_precision_and_scale(DECIMAL128_MAX_PRECISION, scale) + .unwrap(); + + builder.append_value(10); + builder.append_value(20); + builder.append_value(i128::MAX); + builder.finish() + }; + + let decimal_256_array = |scale: i8| { + let mut builder = Decimal256Builder::new() + .with_precision_and_scale(DECIMAL256_MAX_PRECISION, scale) + .unwrap(); + + builder.append_value(10.into()); + builder.append_value(20.into()); + builder.append_value(i256::MAX); + builder.finish() + }; + + RecordBatch::try_new( + schema.into(), + vec![ + Arc::new(decimal_128_array(0)), + Arc::new(decimal_128_array(10)), + Arc::new(decimal_256_array(0)), + Arc::new(decimal_256_array(10)), + ], + ) + .unwrap() +}); + +pub static FLOAT_RECORD_BATCH: LazyLock = LazyLock::new(|| { + let schema = Schema::new(vec![ + Field::new("float16", DataType::Float16, true), + Field::new("float32", DataType::Float32, true), + Field::new("float64", DataType::Float64, true), + ]); + + RecordBatch::try_new( + 
schema.into(), + vec![ + Arc::new(Float16Array::from(vec![ + f16::from_f32(10.0), + f16::from_f32(20.0), + f16::MAX, + ])), + Arc::new(Float32Array::from(vec![10.0, 20.0, f32::MAX])), + Arc::new(Float64Array::from(vec![10.0, 20.0, f64::MAX])), + ], + ) + .unwrap() +}); + +pub static STRING_RECORD_BATCH: LazyLock = LazyLock::new(|| { + let schema = Schema::new(vec![ + Field::new("utf8", DataType::Utf8, true), + Field::new("utf8_view", DataType::Utf8View, true), + Field::new("large_utf8", DataType::LargeUtf8, true), + ]); + + RecordBatch::try_new( + schema.into(), + vec![ + Arc::new(StringArray::from(vec!["aa", "bb", "30"])), + Arc::new(StringViewArray::from(vec!["aa", "bb", "30"])), + Arc::new(LargeStringArray::from(vec!["aa", "bb", "30"])), + ], + ) + .unwrap() +}); + +pub static BINARY_RECORD_BATCH: LazyLock = LazyLock::new(|| { + let schema = Schema::new(vec![ + Field::new("binary", DataType::Binary, true), + Field::new("binary_view", DataType::BinaryView, true), + Field::new("fixed_size_binary", DataType::FixedSizeBinary(2), true), + Field::new("fixed_size_binary_32", DataType::FixedSizeBinary(32), true), + Field::new("large_binary", DataType::LargeBinary, true), + ]); + + RecordBatch::try_new( + schema.into(), + vec![ + Arc::new(BinaryArray::from(vec![b"aa".as_ref(), b"bb", b"cc"])), + Arc::new(BinaryViewArray::from(vec![b"aa".as_ref(), b"bb", b"cc"])), + Arc::new(FixedSizeBinaryArray::from(vec![b"aa", b"bb", b"cc"])), + Arc::new(FixedSizeBinaryArray::from(vec![ + &[10; 32], &[20; 32], &[30; 32], + ])), + Arc::new(LargeBinaryArray::from(vec![b"aa".as_ref(), b"bb", b"cc"])), + ], + ) + .unwrap() +}); + +pub static TIMESTAMP_RECORD_BATCH: LazyLock = LazyLock::new(|| { + let schema = Schema::new(vec![ + Field::new( + "timestamp_second", + DataType::Timestamp(TimeUnit::Second, None), + true, + ), + Field::new( + "timestamp_millisecond", + DataType::Timestamp(TimeUnit::Millisecond, None), + true, + ), + Field::new( + "timestamp_microsecond", + DataType::Timestamp(TimeUnit::Microsecond, None), + true, + ), + Field::new( + "timestamp_nanosecond", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + ), + ]); + + let date_time_one = Utc.with_ymd_and_hms(2020, 1, 1, 0, 0, 0).unwrap(); + let date_time_two = Utc.with_ymd_and_hms(2020, 10, 10, 10, 10, 10).unwrap(); + let date_time_three = Utc.with_ymd_and_hms(2020, 12, 31, 23, 59, 59).unwrap(); + + RecordBatch::try_new( + schema.into(), + vec![ + Arc::new(TimestampSecondArray::from(vec![ + date_time_one.timestamp(), + date_time_two.timestamp(), + date_time_three.timestamp(), + ])), + Arc::new(TimestampMillisecondArray::from(vec![ + date_time_one.timestamp_millis(), + date_time_two.timestamp_millis(), + date_time_three.timestamp_millis(), + ])), + Arc::new(TimestampMicrosecondArray::from(vec![ + date_time_one.timestamp_micros(), + date_time_two.timestamp_micros(), + date_time_three.timestamp_micros(), + ])), + Arc::new(TimestampNanosecondArray::from(vec![ + date_time_one.timestamp_nanos_opt().unwrap(), + date_time_two.timestamp_nanos_opt().unwrap(), + date_time_three.timestamp_nanos_opt().unwrap(), + ])), + ], + ) + .unwrap() +}); + +#[test] +fn record_batch_is_valid() { + let _schema = BOOLEAN_RECORD_BATCH.schema(); + let _schema = INT_RECORD_BATCH.schema(); + let _schema = UINT_RECORD_BATCH.schema(); + let _schema = DECIMAL_RECORD_BATCH.schema(); + let _schema = FLOAT_RECORD_BATCH.schema(); + let _schema = STRING_RECORD_BATCH.schema(); + let _schema = BINARY_RECORD_BATCH.schema(); + let _schema = TIMESTAMP_RECORD_BATCH.schema(); + + let 
_schema = RECORD_BATCH.schema(); +} diff --git a/graph/src/amp/codec/utils.rs b/graph/src/amp/codec/utils.rs new file mode 100644 index 00000000000..4f6ba4ff0b1 --- /dev/null +++ b/graph/src/amp/codec/utils.rs @@ -0,0 +1,120 @@ +use alloy::primitives::{BlockHash, BlockNumber}; +use anyhow::{bail, Context, Result}; +use arrow::array::{ + Array, FixedSizeBinaryArray, RecordBatch, TimestampNanosecondArray, UInt64Array, +}; +use chrono::{DateTime, Utc}; + +use super::{ArrayDecoder, Decoder}; +use crate::amp::common::column_aliases; + +pub fn auto_block_number_decoder<'a>( + record_batch: &'a RecordBatch, +) -> Result<(&'static str, Box> + 'a>)> { + let (&column_name, column_index) = find_column(record_batch, column_aliases::BLOCK_NUMBER) + .with_context(|| { + format!( + "failed to find block numbers column; expected one of: {}", + column_aliases::BLOCK_NUMBER.join(", ") + ) + })?; + + block_number_decoder(record_batch, column_index) + .map(|decoder| (column_name, decoder)) + .with_context(|| format!("column '{column_name}' is not valid")) +} + +pub fn block_number_decoder<'a>( + record_batch: &'a RecordBatch, + column_index: usize, +) -> Result> + 'a>> { + column_decoder::(record_batch, column_index, false) +} + +pub fn auto_block_hash_decoder<'a>( + record_batch: &'a RecordBatch, +) -> Result<(&'static str, Box> + 'a>)> { + let (&column_name, column_index) = find_column(record_batch, column_aliases::BLOCK_HASH) + .with_context(|| { + format!( + "failed to find block hashes column; expected one of: {}", + column_aliases::BLOCK_HASH.join(", ") + ) + })?; + + block_hash_decoder(record_batch, column_index) + .map(|decoder| (column_name, decoder)) + .with_context(|| format!("column '{column_name}' is not valid")) +} + +pub fn block_hash_decoder<'a>( + record_batch: &'a RecordBatch, + column_index: usize, +) -> Result> + 'a>> { + column_decoder::(record_batch, column_index, false) +} + +pub fn auto_block_timestamp_decoder<'a>( + record_batch: &'a RecordBatch, +) -> Result<(&'static str, Box>> + 'a>)> { + let (&column_name, column_index) = find_column(record_batch, column_aliases::BLOCK_TIMESTAMP) + .with_context(|| { + format!( + "failed to find block timestamps column; expected one of: {}", + column_aliases::BLOCK_TIMESTAMP.join(", ") + ) + })?; + + block_timestamp_decoder(record_batch, column_index) + .map(|decoder| (column_name, decoder)) + .with_context(|| format!("column '{column_name}' is not valid")) +} + +pub fn block_timestamp_decoder<'a>( + record_batch: &'a RecordBatch, + column_index: usize, +) -> Result>> + 'a>> { + column_decoder::>(record_batch, column_index, false) +} + +pub fn find_column( + record_batch: &RecordBatch, + column_names: impl IntoIterator, +) -> Option<(T, usize)> +where + T: AsRef, +{ + let schema_ref = record_batch.schema_ref(); + + for column_name in column_names { + if let Some((column_index, _)) = schema_ref.column_with_name(column_name.as_ref()) { + return Some((column_name, column_index)); + } + } + + return None; +} + +pub fn column_decoder<'a, T: 'static, U>( + record_batch: &'a RecordBatch, + column_index: usize, + nullable: bool, +) -> Result> + 'a>> +where + T: Array, + ArrayDecoder<'a, T>: Decoder>, +{ + if column_index >= record_batch.num_columns() { + bail!("column does not exist"); + } + + let array = record_batch.column(column_index); + + if !nullable && array.is_nullable() { + bail!("column must not have nullable values"); + } + + let decoder = ArrayDecoder::::new(array)?; + + Ok(Box::new(decoder)) +} diff --git 
a/graph/src/amp/codec/value_decoder.rs b/graph/src/amp/codec/value_decoder.rs new file mode 100644 index 00000000000..c6e4e7162a2 --- /dev/null +++ b/graph/src/amp/codec/value_decoder.rs @@ -0,0 +1,873 @@ +use anyhow::{anyhow, Context, Result}; +use arrow::{ + array::{ + Array, BinaryArray, BinaryViewArray, BooleanArray, Decimal128Array, Decimal256Array, + FixedSizeBinaryArray, FixedSizeListArray, Float16Array, Float32Array, Float64Array, + Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, LargeListArray, + LargeListViewArray, LargeStringArray, ListArray, ListViewArray, StringArray, + StringViewArray, TimestampMicrosecondArray, TimestampMillisecondArray, + TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, + UInt8Array, + }, + datatypes::{DataType, TimeUnit}, +}; +use chrono::{DateTime, Utc}; + +use super::{ArrayDecoder, Decoder, ListDecoder, MappingDecoder}; +use crate::data::store::{ + scalar::{BigDecimal, BigInt, Bytes, Timestamp}, + Value, ValueType, +}; + +/// Returns a decoder that converts an Arrow array into subgraph store values. +/// +/// # Errors +/// +/// Returns an error if the subgraph store type is not compatible with the Arrow array type. +/// +/// The returned error is deterministic. +pub(super) fn value_decoder<'a>( + value_type: ValueType, + is_list: bool, + array: &'a dyn Array, +) -> Result + 'a>> { + let decoder = if is_list { + list_value_decoder(value_type, array) + } else { + single_value_decoder(value_type, array) + }; + + decoder.with_context(|| { + format!( + "failed to decode '{}' from '{}'", + value_type.to_str(), + array.data_type(), + ) + }) +} + +fn list_value_decoder<'a>( + value_type: ValueType, + array: &'a dyn Array, +) -> Result + 'a>> { + match array.data_type() { + DataType::List(_) => { + let list = array.as_any().downcast_ref::().unwrap(); + let decoder = single_value_decoder(value_type, list.values())?; + let list_decoder = ListDecoder::new(decoder, list.value_offsets().into()); + + Ok(mapping_decoder(list_decoder, Value::List)) + } + DataType::ListView(_) => { + let list = array.as_any().downcast_ref::().unwrap(); + let decoder = single_value_decoder(value_type, list.values())?; + let list_decoder = ListDecoder::new(decoder, list.value_offsets().into()); + + Ok(mapping_decoder(list_decoder, Value::List)) + } + DataType::FixedSizeList(_, _) => { + let list = array.as_any().downcast_ref::().unwrap(); + let decoder = single_value_decoder(value_type, list.values())?; + let list_decoder = ListDecoder::new(decoder, list.value_length().into()); + + Ok(mapping_decoder(list_decoder, Value::List)) + } + DataType::LargeList(_) => { + let list = array.as_any().downcast_ref::().unwrap(); + let decoder = single_value_decoder(value_type, list.values())?; + let list_decoder = ListDecoder::new(decoder, list.value_offsets().into()); + + Ok(mapping_decoder(list_decoder, Value::List)) + } + DataType::LargeListView(_) => { + let list = array.as_any().downcast_ref::().unwrap(); + let decoder = single_value_decoder(value_type, list.values())?; + let list_decoder = ListDecoder::new(decoder, list.value_offsets().into()); + + Ok(mapping_decoder(list_decoder, Value::List)) + } + _ => { + let decoder = single_value_decoder(value_type, array)?; + + Ok(Box::new(MappingDecoder::new(decoder, |value| { + if matches!(value, Value::Null) { + return Value::Null; + } + + Value::List(vec![value]) + }))) + } + } +} + +fn single_value_decoder<'a>( + value_type: ValueType, + array: &'a dyn Array, +) -> Result + 'a>> { + let 
incompatible_types_err = || Err(anyhow!("incompatible types")); + + let decoder = match (value_type, array.data_type()) { + (ValueType::Boolean, DataType::Boolean) => { + let array_decoder = ArrayDecoder::::new(array)?; + mapping_decoder(array_decoder, Value::Bool) + } + (ValueType::Boolean, _) => return incompatible_types_err(), + + (ValueType::Int, data_type) if is_integer(data_type) => { + let integer_decoder = integer_decoder::>(array)?; + mapping_decoder(integer_decoder, Value::Int) + } + (ValueType::Int, _) => return incompatible_types_err(), + + (ValueType::Int8, data_type) if is_integer(data_type) => { + let integer_decoder = integer_decoder::>(array)?; + mapping_decoder(integer_decoder, Value::Int8) + } + (ValueType::Int8, _) => return incompatible_types_err(), + + (ValueType::BigInt, data_type) if is_integer(data_type) => { + let integer_decoder = integer_decoder::>(array)?; + mapping_decoder(integer_decoder, Value::BigInt) + } + (ValueType::BigInt, data_type) if is_string(data_type) => { + let string_decoder = string_decoder::>(array)?; + mapping_decoder(string_decoder, Value::BigInt) + } + (ValueType::BigInt, _) => return incompatible_types_err(), + + (ValueType::BigDecimal, data_type) if is_decimal(data_type) => { + let decimal_decoder = decimal_decoder::>(array)?; + mapping_decoder(decimal_decoder, Value::BigDecimal) + } + (ValueType::BigDecimal, data_type) if is_string(data_type) => { + let string_decoder = string_decoder::>(array)?; + mapping_decoder(string_decoder, Value::BigDecimal) + } + (ValueType::BigDecimal, _) => return incompatible_types_err(), + + (ValueType::Bytes, data_type) if is_binary(data_type) => { + let binary_decoder = binary_decoder::>>(array)?; + mapping_decoder(binary_decoder, |x| Bytes::from(&*x).into()) + } + (ValueType::Bytes, _) => return incompatible_types_err(), + + (ValueType::String, data_type) if is_string(data_type) => { + let string_decoder = string_decoder::>(array)?; + mapping_decoder(string_decoder, Value::String) + } + (ValueType::String, data_type) if is_integer(data_type) => { + let integer_decoder = integer_decoder::>(array)?; + mapping_decoder(integer_decoder, |x| x.to_string().into()) + } + (ValueType::String, data_type) if is_binary(data_type) => { + let binary_decoder = binary_decoder::>>(array)?; + mapping_decoder(binary_decoder, |x| format!("0x{}", hex::encode(x)).into()) + } + (ValueType::String, _) => return incompatible_types_err(), + + (ValueType::Timestamp, data_type) if is_timestamp(data_type) => { + let timestamp_decoder = timestamp_decoder::>>(array)?; + mapping_decoder(timestamp_decoder, |x| Timestamp(x).into()) + } + (ValueType::Timestamp, _) => return incompatible_types_err(), + }; + + Ok(decoder) +} + +fn mapping_decoder<'a, T, U: 'static>( + array_decoder: T, + mapping: fn(U) -> Value, +) -> Box + 'a> +where + T: Decoder> + 'a, +{ + Box::new(MappingDecoder::new( + array_decoder, + move |value: Option| match value { + Some(value) => mapping(value), + None => Value::Null, + }, + )) +} + +fn is_integer(data_type: &DataType) -> bool { + use DataType::*; + + matches! 
{ + data_type, + Int8 | Int16 | Int32 | Int64 | + UInt8 | UInt16 | UInt32 | UInt64 | + Decimal128(_, 0) | Decimal256(_, 0) + } +} + +fn integer_decoder<'a, T>(array: &'a dyn Array) -> Result + 'a>> +where + T: 'static, + ArrayDecoder<'a, Int8Array>: Decoder, + ArrayDecoder<'a, Int16Array>: Decoder, + ArrayDecoder<'a, Int32Array>: Decoder, + ArrayDecoder<'a, Int64Array>: Decoder, + ArrayDecoder<'a, UInt8Array>: Decoder, + ArrayDecoder<'a, UInt16Array>: Decoder, + ArrayDecoder<'a, UInt32Array>: Decoder, + ArrayDecoder<'a, UInt64Array>: Decoder, + ArrayDecoder<'a, Decimal128Array>: Decoder, + ArrayDecoder<'a, Decimal256Array>: Decoder, +{ + use DataType::*; + + let array_decoder: Box> = match array.data_type() { + Int8 => Box::new(ArrayDecoder::::new(array)?), + Int16 => Box::new(ArrayDecoder::::new(array)?), + Int32 => Box::new(ArrayDecoder::::new(array)?), + Int64 => Box::new(ArrayDecoder::::new(array)?), + UInt8 => Box::new(ArrayDecoder::::new(array)?), + UInt16 => Box::new(ArrayDecoder::::new(array)?), + UInt32 => Box::new(ArrayDecoder::::new(array)?), + UInt64 => Box::new(ArrayDecoder::::new(array)?), + Decimal128(_, 0) => Box::new(ArrayDecoder::::new(array)?), + Decimal256(_, 0) => Box::new(ArrayDecoder::::new(array)?), + data_type => return Err(anyhow!("'{data_type}' is not a supported integer type")), + }; + + Ok(array_decoder) +} + +fn is_decimal(data_type: &DataType) -> bool { + use DataType::*; + + matches! { + data_type, + Float16 | Float32 | Float64 | + Decimal128(_, _) | Decimal256(_, _) + } +} + +fn decimal_decoder<'a, T>(array: &'a dyn Array) -> Result + 'a>> +where + T: 'static, + ArrayDecoder<'a, Float16Array>: Decoder, + ArrayDecoder<'a, Float32Array>: Decoder, + ArrayDecoder<'a, Float64Array>: Decoder, + ArrayDecoder<'a, Decimal128Array>: Decoder, + ArrayDecoder<'a, Decimal256Array>: Decoder, +{ + use DataType::*; + + let array_decoder: Box> = match array.data_type() { + Float16 => Box::new(ArrayDecoder::::new(array)?), + Float32 => Box::new(ArrayDecoder::::new(array)?), + Float64 => Box::new(ArrayDecoder::::new(array)?), + Decimal128(_, _) => Box::new(ArrayDecoder::::new(array)?), + Decimal256(_, _) => Box::new(ArrayDecoder::::new(array)?), + data_type => return Err(anyhow!("'{data_type}' is not a supported decimal type")), + }; + + Ok(array_decoder) +} + +fn is_binary(data_type: &DataType) -> bool { + use DataType::*; + + matches! { + data_type, + Binary | BinaryView | FixedSizeBinary(_) | LargeBinary + } +} + +fn binary_decoder<'a, T>(array: &'a dyn Array) -> Result + 'a>> +where + T: 'static, + ArrayDecoder<'a, BinaryArray>: Decoder, + ArrayDecoder<'a, BinaryViewArray>: Decoder, + ArrayDecoder<'a, FixedSizeBinaryArray>: Decoder, + ArrayDecoder<'a, LargeBinaryArray>: Decoder, +{ + use DataType::*; + + let array_decoder: Box> = match array.data_type() { + Binary => Box::new(ArrayDecoder::::new(array)?), + BinaryView => Box::new(ArrayDecoder::::new(array)?), + FixedSizeBinary(_) => Box::new(ArrayDecoder::::new(array)?), + LargeBinary => Box::new(ArrayDecoder::::new(array)?), + data_type => return Err(anyhow!("'{data_type}' is not a supported binary type")), + }; + + Ok(array_decoder) +} + +fn is_string(data_type: &DataType) -> bool { + use DataType::*; + + matches! 
{ + data_type, + Utf8 | Utf8View | LargeUtf8 + } +} + +fn string_decoder<'a, T>(array: &'a dyn Array) -> Result + 'a>> +where + T: 'static, + ArrayDecoder<'a, StringArray>: Decoder, + ArrayDecoder<'a, StringViewArray>: Decoder, + ArrayDecoder<'a, LargeStringArray>: Decoder, +{ + use DataType::*; + + let array_decoder: Box> = match array.data_type() { + Utf8 => Box::new(ArrayDecoder::::new(array)?), + Utf8View => Box::new(ArrayDecoder::::new(array)?), + LargeUtf8 => Box::new(ArrayDecoder::::new(array)?), + data_type => return Err(anyhow!("'{data_type}' is not a supported string type")), + }; + + Ok(array_decoder) +} + +fn is_timestamp(data_type: &DataType) -> bool { + use DataType::*; + + matches! { + data_type, + Timestamp(TimeUnit::Second, _) | + Timestamp(TimeUnit::Millisecond, _) | + Timestamp(TimeUnit::Microsecond, _) | + Timestamp(TimeUnit::Nanosecond, _) + } +} + +fn timestamp_decoder<'a, T>(array: &'a dyn Array) -> Result + 'a>> +where + T: 'static, + ArrayDecoder<'a, TimestampSecondArray>: Decoder, + ArrayDecoder<'a, TimestampMillisecondArray>: Decoder, + ArrayDecoder<'a, TimestampMicrosecondArray>: Decoder, + ArrayDecoder<'a, TimestampNanosecondArray>: Decoder, +{ + use DataType::*; + + let array_decoder: Box> = match array.data_type() { + Timestamp(TimeUnit::Second, _) => { + Box::new(ArrayDecoder::::new(array)?) // + } + Timestamp(TimeUnit::Millisecond, _) => { + Box::new(ArrayDecoder::::new(array)?) // + } + Timestamp(TimeUnit::Microsecond, _) => { + Box::new(ArrayDecoder::::new(array)?) // + } + Timestamp(TimeUnit::Nanosecond, _) => { + Box::new(ArrayDecoder::::new(array)?) // + } + data_type => return Err(anyhow!("'{data_type}' is not a supported timestamp type")), + }; + + Ok(array_decoder) +} + +#[cfg(test)] +mod tests { + use super::super::test_fixtures::*; + use super::*; + + mod boolean_value_decoder { + use super::*; + + fn decoder(column_name: &str, is_list: bool) -> Box> { + value_decoder( + ValueType::Boolean, + is_list, + RECORD_BATCH.column_by_name(column_name).unwrap(), + ) + .unwrap() + } + + #[test] + fn decode_single_values() { + let decoder = decoder("boolean", false); + + assert_eq!(decoder.decode(0).unwrap(), Value::Bool(true)); + assert_eq!(decoder.decode(1).unwrap(), Value::Bool(false)); + assert_eq!(decoder.decode(2).unwrap(), Value::Bool(true)); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + + #[test] + fn decode_single_values_as_lists() { + let decoder = decoder("boolean", true); + + assert_eq!( + decoder.decode(0).unwrap(), + Value::List(vec![Value::Bool(true)]) + ); + assert_eq!( + decoder.decode(1).unwrap(), + Value::List(vec![Value::Bool(false)]) + ); + assert_eq!( + decoder.decode(2).unwrap(), + Value::List(vec![Value::Bool(true)]) + ); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + + #[test] + fn decode_list_values() { + let decoder = decoder("boolean_list", true); + + assert_eq!( + decoder.decode(0).unwrap(), + Value::List(vec![ + Value::Bool(true), + Value::Bool(false), + Value::Bool(true), + ]) + ); + assert_eq!(decoder.decode(1).unwrap(), Value::Null); + } + + #[test] + fn decode_list_view_values() { + let decoder = decoder("boolean_list_view", true); + + assert_eq!( + decoder.decode(0).unwrap(), + Value::List(vec![ + Value::Bool(true), + Value::Bool(false), + Value::Bool(true), + ]) + ); + assert_eq!(decoder.decode(1).unwrap(), Value::Null); + } + + #[test] + fn decode_fixed_size_list_values() { + let decoder = decoder("boolean_fixed_size_list", true); + + assert_eq!( + decoder.decode(0).unwrap(), + 
Value::List(vec![ + Value::Bool(true), + Value::Bool(false), + Value::Bool(true), + ]) + ); + } + + #[test] + fn decode_large_list_values() { + let decoder = decoder("boolean_large_list", true); + + assert_eq!( + decoder.decode(0).unwrap(), + Value::List(vec![ + Value::Bool(true), + Value::Bool(false), + Value::Bool(true), + ]) + ); + assert_eq!(decoder.decode(1).unwrap(), Value::Null); + } + + #[test] + fn decode_large_list_view_values() { + let decoder = decoder("boolean_large_list_view", true); + + assert_eq!( + decoder.decode(0).unwrap(), + Value::List(vec![ + Value::Bool(true), + Value::Bool(false), + Value::Bool(true), + ]) + ); + assert_eq!(decoder.decode(1).unwrap(), Value::Null); + } + + #[test] + fn fail_to_decode_values_of_other_types() { + value_decoder(ValueType::Boolean, false, BINARY_RECORD_BATCH.column(0)) + .map(|_| ()) + .unwrap_err(); + } + } + + mod int_value_decoder { + use super::*; + + fn decoder(column_name: &str) -> Box> { + value_decoder( + ValueType::Int, + false, + RECORD_BATCH.column_by_name(column_name).unwrap(), + ) + .unwrap() + } + + #[test] + fn decode_values() { + for column in [ + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "decimal128", + "decimal256", + ] { + let decoder = decoder(column); + + assert_eq!(decoder.decode(0).unwrap(), Value::Int(10)); + assert_eq!(decoder.decode(1).unwrap(), Value::Int(20)); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + } + + #[test] + fn fail_to_decode_values_of_other_types() { + value_decoder(ValueType::Int, false, BOOLEAN_RECORD_BATCH.column(0)) + .map(|_| ()) + .unwrap_err(); + } + } + + mod int8_value_decoder { + use super::*; + + fn decoder(column_name: &str) -> Box> { + value_decoder( + ValueType::Int8, + false, + RECORD_BATCH.column_by_name(column_name).unwrap(), + ) + .unwrap() + } + + #[test] + fn decode_values() { + for column in [ + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "decimal128", + "decimal256", + ] { + let decoder = decoder(column); + + assert_eq!(decoder.decode(0).unwrap(), Value::Int8(10)); + assert_eq!(decoder.decode(1).unwrap(), Value::Int8(20)); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + } + + #[test] + fn fail_to_decode_values_of_other_types() { + value_decoder(ValueType::Int8, false, BOOLEAN_RECORD_BATCH.column(0)) + .map(|_| ()) + .unwrap_err(); + } + } + + mod big_int_value_decoder { + use super::*; + + fn decoder(column_name: &str) -> Box> { + value_decoder( + ValueType::BigInt, + false, + RECORD_BATCH.column_by_name(column_name).unwrap(), + ) + .unwrap() + } + + #[test] + fn decode_values() { + for column in [ + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "decimal128", + "decimal256", + ] { + let decoder = decoder(column); + + assert_eq!(decoder.decode(0).unwrap(), Value::BigInt(BigInt::from(10))); + assert_eq!(decoder.decode(1).unwrap(), Value::BigInt(BigInt::from(20))); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + } + + #[test] + fn decode_values_from_numerical_strings() { + for column in ["utf8", "utf8_view", "large_utf8"] { + let decoder = decoder(column); + + assert_eq!(decoder.decode(2).unwrap(), Value::BigInt(BigInt::from(30))); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + } + + #[test] + fn fail_to_decode_values_from_non_numerical_strings() { + for column in ["utf8", "utf8_view", "large_utf8"] { + let decoder = decoder(column); + + decoder.decode(0).unwrap_err(); + } + } + + 
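// Illustrative sketch (not part of the original change): the `is_integer` check above only
// treats `Decimal128(_, 0)` / `Decimal256(_, 0)` as integers, so a decimal column with a
// non-zero scale is neither an integer nor a string type for a `BigInt` target. Assuming the
// "decimal128_with_scale" fixture column from test_fixtures.rs, a test along these lines
// would exercise that rejection path:
#[test]
fn fail_to_decode_values_from_scaled_decimals() {
    // Decimal128 with scale 10 should fail at decoder construction with an
    // "incompatible types" error.
    value_decoder(
        ValueType::BigInt,
        false,
        RECORD_BATCH
            .column_by_name("decimal128_with_scale")
            .unwrap(),
    )
    .map(|_| ())
    .unwrap_err();
}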
#[test] + fn fail_to_decode_values_of_other_types() { + value_decoder(ValueType::BigInt, false, BOOLEAN_RECORD_BATCH.column(0)) + .map(|_| ()) + .unwrap_err(); + } + } + + mod big_decimal_value_decoder { + use super::*; + + fn decoder(column_name: &str) -> Box> { + value_decoder( + ValueType::BigDecimal, + false, + RECORD_BATCH.column_by_name(column_name).unwrap(), + ) + .unwrap() + } + + #[test] + fn decode_values() { + for column in ["float16", "float32", "float64", "decimal128", "decimal256"] { + let decoder = decoder(column); + + assert_eq!( + decoder.decode(0).unwrap(), + Value::BigDecimal(BigDecimal::from(10.0)) + ); + assert_eq!( + decoder.decode(1).unwrap(), + Value::BigDecimal(BigDecimal::from(20.0)) + ); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + } + + #[test] + fn decode_values_from_numerical_strings() { + for column in ["utf8", "utf8_view", "large_utf8"] { + let decoder = decoder(column); + + assert_eq!( + decoder.decode(2).unwrap(), + Value::BigDecimal(BigDecimal::from(30.0)) + ); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + } + + #[test] + fn fail_to_decode_values_from_non_numerical_strings() { + for column in ["utf8", "utf8_view", "large_utf8"] { + let decoder = decoder(column); + + decoder.decode(0).unwrap_err(); + } + } + + #[test] + fn fail_to_decode_values_of_other_types() { + value_decoder(ValueType::BigDecimal, false, BOOLEAN_RECORD_BATCH.column(0)) + .map(|_| ()) + .unwrap_err(); + } + } + + mod bytes_value_decoder { + use super::*; + + fn decoder(column_name: &str) -> Box> { + value_decoder( + ValueType::Bytes, + false, + RECORD_BATCH.column_by_name(column_name).unwrap(), + ) + .unwrap() + } + + #[test] + fn decode_values() { + for column in ["binary", "binary_view", "fixed_size_binary", "large_binary"] { + let decoder = decoder(column); + + assert_eq!( + decoder.decode(0).unwrap(), + Value::Bytes(b"aa".as_slice().into()) + ); + assert_eq!( + decoder.decode(1).unwrap(), + Value::Bytes(b"bb".as_slice().into()) + ); + assert_eq!( + decoder.decode(2).unwrap(), + Value::Bytes(b"cc".as_slice().into()) + ); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + } + + #[test] + fn fail_to_decode_values_of_other_types() { + value_decoder(ValueType::Bytes, false, BOOLEAN_RECORD_BATCH.column(0)) + .map(|_| ()) + .unwrap_err(); + } + } + + mod string_value_decoder { + use super::*; + + fn decoder(column_name: &str) -> Box> { + value_decoder( + ValueType::String, + false, + RECORD_BATCH.column_by_name(column_name).unwrap(), + ) + .unwrap() + } + + #[test] + fn decode_values_from_strings() { + for column in ["utf8", "utf8_view", "large_utf8"] { + let decoder = decoder(column); + + assert_eq!(decoder.decode(0).unwrap(), Value::String("aa".to_string())); + assert_eq!(decoder.decode(1).unwrap(), Value::String("bb".to_string())); + assert_eq!(decoder.decode(2).unwrap(), Value::String("30".to_string())); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + } + + #[test] + fn decode_values_from_numbers() { + for column in [ + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "decimal128", + "decimal256", + ] { + let decoder = decoder(column); + + assert_eq!(decoder.decode(0).unwrap(), Value::String("10".to_string())); + assert_eq!(decoder.decode(1).unwrap(), Value::String("20".to_string())); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + } + + #[test] + fn decode_values_from_bytes() { + for column in ["binary", "binary_view", "fixed_size_binary", "large_binary"] { + let decoder = 
decoder(column); + + assert_eq!( + decoder.decode(0).unwrap(), + Value::String(format!("0x{}", hex::encode(b"aa"))) + ); + assert_eq!( + decoder.decode(1).unwrap(), + Value::String(format!("0x{}", hex::encode(b"bb"))) + ); + assert_eq!( + decoder.decode(2).unwrap(), + Value::String(format!("0x{}", hex::encode(b"cc"))) + ); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + } + + #[test] + fn fail_to_decode_values_of_other_types() { + value_decoder(ValueType::String, false, BOOLEAN_RECORD_BATCH.column(0)) + .map(|_| ()) + .unwrap_err(); + } + } + + mod timestamp_value_decoder { + use chrono::{TimeZone, Utc}; + + use super::*; + + fn decoder(column_name: &str) -> Box> { + value_decoder( + ValueType::Timestamp, + false, + RECORD_BATCH.column_by_name(column_name).unwrap(), + ) + .unwrap() + } + + #[test] + fn decode_values() { + for column in [ + "timestamp_second", + "timestamp_millisecond", + "timestamp_microsecond", + "timestamp_nanosecond", + ] { + let decoder = decoder(column); + + assert_eq!( + decoder.decode(0).unwrap(), + Value::Timestamp(Utc.with_ymd_and_hms(2020, 1, 1, 0, 0, 0).unwrap().into()) + ); + assert_eq!( + decoder.decode(1).unwrap(), + Value::Timestamp( + Utc.with_ymd_and_hms(2020, 10, 10, 10, 10, 10) + .unwrap() + .into() + ) + ); + assert_eq!( + decoder.decode(2).unwrap(), + Value::Timestamp( + Utc.with_ymd_and_hms(2020, 12, 31, 23, 59, 59) + .unwrap() + .into() + ) + ); + assert_eq!(decoder.decode(3).unwrap(), Value::Null); + } + } + + #[test] + fn fail_to_decode_values_of_other_types() { + value_decoder(ValueType::Timestamp, false, BOOLEAN_RECORD_BATCH.column(0)) + .map(|_| ()) + .unwrap_err(); + } + } +} diff --git a/graph/src/amp/common/mod.rs b/graph/src/amp/common/mod.rs new file mode 100644 index 00000000000..d98fbea3b1b --- /dev/null +++ b/graph/src/amp/common/mod.rs @@ -0,0 +1,24 @@ +pub(super) mod column_aliases { + pub(in crate::amp) static BLOCK_NUMBER: &[&str] = &[ + "_block_num", // Meta column present in all tables + "block_num", // Standard column in most raw tables + "blockNum", // Common alternative name + "blocknum", // Common alternative name + "block", // Common alternative name + "block_number", // Common alternative name + "blockNumber", // Common alternative name + "blocknumber", // Common alternative name + ]; + pub(in crate::amp) static BLOCK_HASH: &[&str] = &[ + "hash", // Standard column in some raw tables + "block_hash", // Standard column in most raw tables and common alternative name + "blockHash", // Common alternative name + "blockhash", // Common alternative name + ]; + pub(in crate::amp) static BLOCK_TIMESTAMP: &[&str] = &[ + "timestamp", // Standard column in most raw tables + "block_timestamp", // Common alternative name + "blockTimestamp", // Common alternative name + "blocktimestamp", // Common alternative name + ]; +} diff --git a/graph/src/amp/error.rs b/graph/src/amp/error.rs new file mode 100644 index 00000000000..3489d7a94de --- /dev/null +++ b/graph/src/amp/error.rs @@ -0,0 +1,5 @@ +/// Checks whether errors are deterministic. +pub trait IsDeterministic { + /// Returns `true` if the error is deterministic. 
+ fn is_deterministic(&self) -> bool; +} diff --git a/graph/src/amp/log.rs b/graph/src/amp/log.rs new file mode 100644 index 00000000000..e11c129b6b7 --- /dev/null +++ b/graph/src/amp/log.rs @@ -0,0 +1,20 @@ +use std::borrow::Cow; + +use lazy_regex::regex_replace_all; + +/// Extends the [slog::Logger] with methods commonly used in Amp modules +pub trait Logger { + /// Creates a new child logger scoped to a specific component + fn component(&self, name: &'static str) -> slog::Logger; +} + +impl Logger for slog::Logger { + fn component(&self, name: &'static str) -> slog::Logger { + self.new(slog::o!("component" => name)) + } +} + +/// Removes newlines and extra spaces from a string +pub fn one_line<'a>(s: &'a str) -> Cow<'a, str> { + regex_replace_all!(r"(\\r)?(\\n)?\s+", s, " ") +} diff --git a/graph/src/amp/manifest/data_source/mod.rs b/graph/src/amp/manifest/data_source/mod.rs new file mode 100644 index 00000000000..eb0575fdbf4 --- /dev/null +++ b/graph/src/amp/manifest/data_source/mod.rs @@ -0,0 +1,112 @@ +pub mod raw; + +use alloy::{ + json_abi::JsonAbi, + primitives::{Address, BlockNumber}, +}; +use arrow::datatypes::Schema; +use semver::Version; + +use crate::{amp::sql::BlockRangeQueryBuilder, data::subgraph::SPEC_VERSION_1_5_0}; + +pub use self::raw::RawDataSource; + +/// Represents a valid data source of an Amp subgraph. +/// +/// This data source contains parsed, formatted, and resolved data. +#[derive(Debug, Clone)] +pub struct DataSource { + /// The name of the data source. + /// + /// Used for observability to identify progress and errors produced by this data source. + pub name: String, + + /// The network name of the data source. + pub network: String, + + /// Contains the sources used by this data source. + pub source: Source, + + /// Contains the transformations of source tables indexed by the subgraph. + pub transformer: Transformer, +} + +impl DataSource { + pub const KIND: &str = "amp"; + pub const MIN_SPEC_VERSION: Version = SPEC_VERSION_1_5_0; +} + +/// Contains the sources that a data source uses. +#[derive(Debug, Clone)] +pub struct Source { + /// The dataset from which SQL queries in the data source can query. + pub dataset: String, + + /// The tables from which SQL queries in the data source can query. + pub tables: Vec, + + /// The contract address with which SQL queries in the data source interact. + /// + /// This address enables SQL query reuse through `sg_source_address()` calls instead of hard-coding the contract address. + /// The `sg_source_address()` calls in SQL queries of the data source resolve to this contract address. + /// + /// SQL queries are not limited to using only this contract address. + /// + /// Defaults to an empty contract address. + pub address: Address, + + /// The minimum block number that SQL queries in the data source can query. + /// + /// Defaults to the minimum possible block number. + pub start_block: BlockNumber, + + /// The maximum block number that SQL queries in the data source can query. + /// + /// Defaults to the maximum possible block number. + pub end_block: BlockNumber, +} + +/// Contains the transformations of source tables indexed by the subgraph. +#[derive(Debug, Clone)] +pub struct Transformer { + /// The version of this transformer. + pub api_version: Version, + + /// The ABIs that SQL queries can reference to extract event signatures. + /// + /// The `sg_event_signature('CONTRACT_NAME', 'EVENT_NAME')` calls in the + /// SQL queries resolve to a full event signature based on this list. 
+ pub abis: Vec, + + /// The transformed tables that extract data from source tables for indexing. + pub tables: Vec, +} + +/// Represents an ABI of a smart contract. +#[derive(Debug, Clone)] +pub struct Abi { + /// The name of the contract. + pub name: String, + + /// The JSON ABI of the contract. + pub contract: JsonAbi, +} + +/// Represents a transformed table that extracts data from source tables for indexing. +#[derive(Debug, Clone)] +pub struct Table { + /// The name of the transformed table. + /// + /// Must reference a valid entity name from the subgraph schema. + pub name: String, + + /// The SQL query that executes on the Amp server. + /// + /// The data resulting from this SQL query execution transforms into subgraph entities. + pub query: BlockRangeQueryBuilder, + + /// The Arrow schema of this transformed table SQL query. + /// + /// This schema loads from the Amp server. + pub schema: Schema, +} diff --git a/graph/src/amp/manifest/data_source/raw.rs b/graph/src/amp/manifest/data_source/raw.rs new file mode 100644 index 00000000000..4f4d8cef71d --- /dev/null +++ b/graph/src/amp/manifest/data_source/raw.rs @@ -0,0 +1,695 @@ +use std::{collections::HashSet, sync::LazyLock}; + +use alloy::{ + json_abi::JsonAbi, + primitives::{Address, BlockNumber}, +}; +use anyhow::anyhow; +use arrow::{array::RecordBatch, datatypes::Schema}; +use futures03::future::try_join_all; +use lazy_regex::regex_is_match; +use semver::Version; +use serde::Deserialize; +use slog::{debug, error, Logger}; +use thiserror::Error; + +use super::{Abi, DataSource, Source, Table, Transformer}; +use crate::{ + amp::{ + self, + codec::utils::{ + auto_block_hash_decoder, auto_block_number_decoder, auto_block_timestamp_decoder, + }, + error::IsDeterministic, + sql::{BlockRangeQueryBuilder, ContextQuery, ValidQuery}, + }, + components::link_resolver::{LinkResolver, LinkResolverContext}, + data::subgraph::DeploymentHash, +}; + +/// Supported API versions for data source transformers. +static API_VERSIONS: LazyLock> = + LazyLock::new(|| HashSet::from([Version::new(0, 0, 1)])); + +/// Represents an unmodified input data source of an Amp subgraph. +/// +/// May contain invalid or partial data. +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct RawDataSource { + /// The name of the data source. + /// + /// Must be a valid, non-empty identifier with no spaces or special characters. + pub name: String, + + /// The kind of the data source. + /// + /// Must be equal to `amp`. + pub kind: String, + + /// The network name of the data source. + pub network: String, + + /// Contains sources used by this data source. + pub source: RawSource, + + /// Contains transformations of source tables indexed by the subgraph. + pub transformer: RawTransformer, +} + +impl RawDataSource { + /// Parses, formats, and resolves the input data source into a valid data source. 
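// A hedged sketch of the manifest fragment this type deserializes from (field names follow
// the `camelCase` rename above; the dataset, table, address, and IPFS values below are
// placeholders rather than values taken from the original change):
//
//   kind: amp
//   name: Transfers
//   network: ethereum-mainnet
//   source:
//     dataset: edgeandnode/ethereum_mainnet
//     tables: [logs]
//     address: "0x0000000000000000000000000000000000000000"
//     startBlock: 0
//   transformer:
//     apiVersion: 0.0.1
//     abis:
//       - name: ERC20
//         file: /ipfs/Qm...
//     tables:
//       - name: Transfer
//         file: /ipfs/Qm...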
+ pub async fn resolve( + self, + logger: &Logger, + link_resolver: &dyn LinkResolver, + amp_client: &impl amp::Client, + ) -> Result { + let Self { + name, + kind, + network, + source, + transformer, + } = self; + + let logger = logger.new(slog::o!("data_source" => name.clone())); + debug!(logger, "Resolving data source"); + + validate_ident(&name).map_err(|e| e.source_context("invalid `name`"))?; + Self::validate_kind(kind)?; + + let source = source + .resolve() + .map_err(|e| e.source_context("invalid `source`"))?; + + let transformer = transformer + .resolve(&logger, link_resolver, amp_client, &network, &source) + .await + .map_err(|e| e.source_context("invalid `transformer`"))?; + + Ok(DataSource { + name, + network, + source, + transformer, + }) + } + + fn validate_kind(kind: String) -> Result<(), Error> { + if !kind.eq_ignore_ascii_case(DataSource::KIND) { + return Err(Error::InvalidValue(anyhow!("invalid `kind`"))); + } + + Ok(()) + } +} + +/// Contains an unmodified input source used by the data source. +/// +/// May contain invalid or partial data. +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct RawSource { + /// The dataset that SQL queries in the data source can query. + /// + /// Must reference a valid dataset name from the Amp server. + pub dataset: String, + + /// The tables that SQL queries in the data source can query. + /// + /// Must reference valid table names of the dataset from the Amp server. + pub tables: Vec, + + /// The contract address used by SQL queries in the data source. + /// + /// Enables SQL query reuse through `sg_source_address()` calls instead of hard-coding the contract address. + /// SQL queries resolve `sg_source_address()` calls to this contract address. + pub address: Option
, + + /// The minimum block number that SQL queries in the data source can query. + pub start_block: Option, + + /// The maximum block number that SQL queries in the data source can query. + pub end_block: Option, +} + +impl RawSource { + /// Parses, formats, and resolves the input source into a valid source. + fn resolve(self) -> Result { + let Self { + dataset, + tables, + address, + start_block, + end_block, + } = self; + + if dataset.is_empty() { + return Err(Error::InvalidValue(anyhow!("`dataset` cannot be empty"))); + } + Self::validate_tables(&tables)?; + + let address = address.unwrap_or(Address::ZERO); + let start_block = start_block.unwrap_or(BlockNumber::MIN); + let end_block = end_block.unwrap_or(BlockNumber::MAX); + + if start_block >= end_block { + return Err(Error::InvalidValue(anyhow!( + "`end_block` must be greater than `start_block`" + ))); + } + + Ok(Source { + dataset, + tables, + address, + start_block, + end_block, + }) + } + + fn validate_tables(tables: &[String]) -> Result<(), Error> { + const MAX_TABLES: usize = 100; + + if tables.is_empty() { + return Err(Error::InvalidValue(anyhow!("`tables` cannot be empty"))); + } + + if tables.len() > MAX_TABLES { + return Err(Error::InvalidValue(anyhow!( + "`tables` cannot have more than {MAX_TABLES} tables" + ))); + } + + for (i, table) in tables.iter().enumerate() { + if table.is_empty() { + return Err(Error::InvalidValue(anyhow!( + "`table` at index {i} cannot be empty" + ))); + } + } + + Ok(()) + } +} + +/// Contains unmodified input transformations of source tables indexed by the subgraph. +/// +/// May contain invalid or partial data. +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct RawTransformer { + /// The version of this transformer. + /// + /// Must be a supported API version of the Amp subgraph transformers API. + pub api_version: Version, + + /// The ABIs that SQL queries can reference to extract event signatures. + /// + /// SQL queries resolve `sg_event_signature('CONTRACT_NAME', 'EVENT_NAME')` calls + /// to full event signatures based on this list. + pub abis: Option>, + + /// The transformed tables that extract data from source tables for indexing. + pub tables: Vec, +} + +impl RawTransformer { + /// Parses, formats, and resolves the input transformer into a valid transformer. 
+ async fn resolve( + self, + logger: &Logger, + link_resolver: &dyn LinkResolver, + amp_client: &impl amp::Client, + network: &str, + source: &Source, + ) -> Result { + let Self { + api_version, + abis, + tables, + } = self; + Self::validate_api_version(&api_version)?; + + let abis = Self::resolve_abis(logger, link_resolver, abis).await?; + let tables = Self::resolve_tables( + logger, + link_resolver, + amp_client, + network, + tables, + source, + &abis, + ) + .await?; + + Ok(Transformer { + api_version, + abis, + tables, + }) + } + + fn validate_api_version(api_version: &Version) -> Result<(), Error> { + if !API_VERSIONS.contains(api_version) { + return Err(Error::InvalidValue(anyhow!("invalid `api_version`"))); + } + + Ok(()) + } + + async fn resolve_abis( + logger: &Logger, + link_resolver: &dyn LinkResolver, + abis: Option>, + ) -> Result, Error> { + const MAX_ABIS: usize = 100; + + let Some(abis) = abis else { + return Ok(Vec::new()); + }; + + if abis.len() > MAX_ABIS { + return Err(Error::InvalidValue(anyhow!( + "`abis` cannot have more than {MAX_ABIS} ABIs" + ))); + } + + let abi_futs = abis.into_iter().enumerate().map(|(i, abi)| async move { + let logger = logger.new(slog::o!("abi_name" => abi.name.clone())); + debug!(logger, "Resolving ABI"; + "file" => &abi.file, + ); + + abi.resolve(&logger, link_resolver) + .await + .map_err(|e| e.source_context(format!("invalid `abis` at index {i}"))) + }); + + try_join_all(abi_futs).await + } + + async fn resolve_tables( + logger: &Logger, + link_resolver: &dyn LinkResolver, + amp_client: &impl amp::Client, + network: &str, + tables: Vec, + source: &Source, + abis: &[Abi], + ) -> Result, Error> { + const MAX_TABLES: usize = 100; + + if tables.is_empty() { + return Err(Error::InvalidValue(anyhow!("`tables` cannot be empty"))); + } + + if tables.len() > MAX_TABLES { + return Err(Error::InvalidValue(anyhow!( + "`tables` cannot have more than {MAX_TABLES} tables" + ))); + } + + let table_futs = tables.into_iter().enumerate().map(|(i, table)| async move { + let logger = logger.new(slog::o!("table_name" => table.name.clone())); + debug!(logger, "Resolving table"; + "file" => ?&table.file + ); + + table + .resolve(&logger, link_resolver, amp_client, network, source, abis) + .await + .map_err(|e| e.source_context(format!("invalid `tables` at index {i}"))) + }); + + try_join_all(table_futs).await + } +} + +/// Represents an unmodified input ABI of a smart contract. +/// +/// May contain invalid or partial data. +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct RawAbi { + /// The name of the contract. + pub name: String, + + /// The IPFS link to the JSON ABI of the contract. + pub file: String, +} + +impl RawAbi { + /// Parses, formats, and resolves the input ABI into a valid ABI. 
+ async fn resolve( + self, + logger: &Logger, + link_resolver: &dyn LinkResolver, + ) -> Result { + let Self { name, file } = self; + + validate_ident(&name).map_err(|e| e.source_context("invalid `name`"))?; + let contract = Self::resolve_contract(logger, link_resolver, file).await?; + + Ok(Abi { name, contract }) + } + + async fn resolve_contract( + logger: &Logger, + link_resolver: &dyn LinkResolver, + file: String, + ) -> Result { + if file.is_empty() { + return Err(Error::InvalidValue(anyhow!("`file` cannot be empty"))); + } + + let file_bytes = link_resolver + .cat( + &LinkResolverContext::new(&DeploymentHash::default(), &logger), + &(file.into()), + ) + .await + .map_err(|e| Error::FailedToResolveFile(e.context("invalid `file`")))?; + + let contract: JsonAbi = serde_json::from_slice(&file_bytes) + .map_err(|e| Error::InvalidValue(anyhow!(e).context("invalid `file`")))?; + + Ok(contract) + } +} + +/// Represents an unmodified input transformed table that extracts data from source tables for indexing. +/// +/// May contain invalid or partial data. +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct RawTable { + /// The name of the transformed table. + /// + /// Must reference a valid entity name from the subgraph schema. + pub name: String, + + /// The SQL query that executes on the Amp server. + /// + /// Transforms the execution results into subgraph entities. + pub query: Option, + + /// The IPFS link to the SQL query that executes on the Amp server. + /// + /// Transforms the execution results into subgraph entities. + /// + /// Ignored when `query` is set. + pub file: Option, +} + +impl RawTable { + /// Parses, formats, and resolves the input table into a valid transformed table. + async fn resolve( + self, + logger: &Logger, + link_resolver: &dyn LinkResolver, + amp_client: &impl amp::Client, + network: &str, + source: &Source, + abis: &[Abi], + ) -> Result { + let Self { name, query, file } = self; + + validate_ident(&name).map_err(|e| e.source_context("invalid `name`"))?; + let query = match Self::resolve_query(query, source, abis)? 
{ + Some(query) => query, + None => Self::resolve_file(logger, link_resolver, file, source, abis).await?, + }; + + debug!(logger, "Resolving query schema"); + let schema = Self::resolve_schema(logger, amp_client, &query).await?; + + for field in schema.fields() { + validate_ident(field.name()).map_err(|e| { + e.source_context(format!( + "invalid query output schema: invalid column '{}'", + field.name() + )) + })?; + } + + let block_range_query_builder = Self::resolve_block_range_query_builder( + logger, + amp_client, + network, + source, + query, + schema.clone(), + ) + .await?; + + Ok(Table { + name, + query: block_range_query_builder, + schema, + }) + } + + fn resolve_query( + query: Option, + source: &Source, + abis: &[Abi], + ) -> Result, Error> { + let Some(query) = query else { + return Ok(None); + }; + + if query.is_empty() { + return Err(Error::InvalidValue(anyhow!("`query` cannot be empty"))); + } + + ValidQuery::new( + &query, + source.dataset.as_str(), + source.tables.iter().map(|table| table.as_str()), + &source.address, + abis.iter().map(|abi| (abi.name.as_str(), &abi.contract)), + ) + .map(Some) + .map_err(|e| Error::InvalidValue(e.context("invalid `query`"))) + } + + async fn resolve_file( + logger: &Logger, + link_resolver: &dyn LinkResolver, + file: Option, + source: &Source, + abis: &[Abi], + ) -> Result { + debug!(logger, "Resolving query file"); + + let Some(file) = file else { + return Err(Error::InvalidValue(anyhow!("`file` cannot be empty"))); + }; + + if file.is_empty() { + return Err(Error::InvalidValue(anyhow!("`file` cannot be empty"))); + } + + let file_bytes = link_resolver + .cat( + &LinkResolverContext::new(&DeploymentHash::default(), logger), + &(file.into()), + ) + .await + .map_err(|e| Error::FailedToResolveFile(e.context("invalid `file`")))?; + + let query = String::from_utf8(file_bytes) + .map_err(|e| Error::InvalidValue(anyhow!(e).context("invalid `file`")))?; + + if query.is_empty() { + return Err(Error::InvalidValue(anyhow!("`file` cannot be empty"))); + } + + ValidQuery::new( + &query, + source.dataset.as_str(), + source.tables.iter().map(|table| table.as_str()), + &source.address, + abis.iter().map(|abi| (abi.name.as_str(), &abi.contract)), + ) + .map_err(|e| Error::InvalidValue(e.context("invalid `file`"))) + } + + async fn resolve_schema( + logger: &Logger, + amp_client: &impl amp::Client, + query: impl ToString, + ) -> Result { + amp_client + .schema(logger, query) + .await + .map_err(|e| Error::FailedToExecuteQuery { + is_deterministic: e.is_deterministic(), + source: anyhow!(e).context("failed to load schema"), + }) + } + + async fn resolve_block_range_query_builder( + logger: &Logger, + amp_client: &impl amp::Client, + network: &str, + source: &Source, + query: ValidQuery, + schema: Schema, + ) -> Result { + debug!(logger, "Resolving block range query builder"); + + let record_batch = RecordBatch::new_empty(schema.into()); + let (block_number_column, _) = + auto_block_number_decoder(&record_batch).map_err(|e| Error::InvalidQuery(e))?; + + let has_block_hash_column = auto_block_hash_decoder(&record_batch).is_ok(); + let has_block_timestamp_column = auto_block_timestamp_decoder(&record_batch).is_ok(); + + if has_block_hash_column && has_block_timestamp_column { + return Ok(BlockRangeQueryBuilder::new(query, block_number_column)); + } + + debug!(logger, "Resolving context query"); + let mut context_query: Option = None; + + // TODO: Context is embedded in the original query using INNER JOIN to ensure availability for every output row. 
+ // This requires all source tables to match or exceed the expected query output size. + let context_sources_iter = source + .tables + .iter() + .map(|table| (source.dataset.as_str(), table.as_str())) + // TODO: Replace hardcoded values with schema metadata sources when available + .chain(match network { + "ethereum-mainnet" => vec![("edgeandnode/ethereum_mainnet", "blocks")], + "base-mainnet" => vec![("edgeandnode/base_mainnet", "blocks")], + "base-sepolia" => vec![("edgeandnode/base_sepolia", "blocks")], + "arbitrum-one" => vec![("edgeandnode/arbitrum_one", "blocks")], + _ => vec![], + }); + + for (dataset, table) in context_sources_iter { + let context_logger = logger.new(slog::o!( + "context_dataset" => dataset.to_string(), + "context_table" => table.to_string() + )); + debug!(context_logger, "Loading context schema"); + let schema_query = format!("SELECT * FROM {dataset}.{table}"); + let schema = match Self::resolve_schema(logger, amp_client, schema_query).await { + Ok(schema) => schema, + Err(e) => { + error!(context_logger, "Failed to load context schema"; + "e" => ?e + ); + continue; + } + }; + + let record_batch = RecordBatch::new_empty(schema.clone().into()); + let mut columns = Vec::new(); + + if !has_block_hash_column { + let Ok((block_hash_column, _)) = auto_block_hash_decoder(&record_batch) else { + debug!( + context_logger, + "Context schema does not contain block hash column, skipping" + ); + continue; + }; + + columns.push(block_hash_column); + } + + if !has_block_timestamp_column { + let Ok((block_timestamp_column, _)) = auto_block_timestamp_decoder(&record_batch) + else { + debug!( + context_logger, + "Context schema does not contain block timestamp column, skipping" + ); + continue; + }; + + columns.push(block_timestamp_column); + } + + debug!(context_logger, "Creating context query"); + context_query = Some(ContextQuery::new( + query, + block_number_column, + dataset, + table, + columns, + )); + break; + } + + if let Some(context_query) = context_query { + return Ok(BlockRangeQueryBuilder::new_with_context(context_query)); + } + + Err(Error::InvalidQuery(anyhow!( + "query is required to output block numbers, block hashes and block timestamps" + ))) + } +} + +#[derive(Debug, Error)] +pub enum Error { + #[error("invalid value: {0:#}")] + InvalidValue(#[source] anyhow::Error), + + #[error("invalid query: {0:#}")] + InvalidQuery(#[source] anyhow::Error), + + #[error("failed to resolve file: {0:#}")] + FailedToResolveFile(#[source] anyhow::Error), + + #[error("failed to execute query: {source:#}")] + FailedToExecuteQuery { + source: anyhow::Error, + is_deterministic: bool, + }, +} + +impl Error { + /// Extends the source errors with additional context keeping the original error kind and the determinism. + fn source_context(self, cx: impl Into) -> Self { + match self { + Self::InvalidValue(e) => Self::InvalidValue(e.context(cx.into())), + Self::InvalidQuery(e) => Self::InvalidQuery(e.context(cx.into())), + Self::FailedToResolveFile(e) => Self::FailedToResolveFile(e.context(cx.into())), + Self::FailedToExecuteQuery { + source, + is_deterministic, + } => Self::FailedToExecuteQuery { + source: source.context(cx.into()), + is_deterministic, + }, + } + } +} + +impl IsDeterministic for Error { + fn is_deterministic(&self) -> bool { + match self { + Self::InvalidValue(_) => true, + Self::InvalidQuery(_) => true, + Self::FailedToResolveFile(_) => false, + Self::FailedToExecuteQuery { + is_deterministic, .. 
+ } => *is_deterministic, + } + } +} + +fn validate_ident(s: &str) -> Result<(), Error> { + if !regex_is_match!("^[a-zA-Z_][a-zA-Z0-9_-]{0,100}$", s) { + return Err(Error::InvalidValue( + anyhow!("invalid identifier '{s}': must start with a letter or an underscore, and contain only letters, numbers, hyphens, and underscores") + )); + } + Ok(()) +} diff --git a/graph/src/amp/manifest/mod.rs b/graph/src/amp/manifest/mod.rs new file mode 100644 index 00000000000..028d567332c --- /dev/null +++ b/graph/src/amp/manifest/mod.rs @@ -0,0 +1,99 @@ +pub mod data_source; + +use std::sync::Arc; + +use anyhow::{bail, Context, Result}; +use itertools::Itertools; +use semver::Version; +use slog::Logger; + +use crate::{ + amp::Client, + blockchain::Blockchain, + cheap_clone::CheapClone as _, + components::link_resolver::LinkResolver, + data::subgraph::{BaseSubgraphManifest, DeploymentHash, UnresolvedSubgraphManifest}, + data_source::DataSource as GenericDataSource, + schema::InputSchema, +}; + +pub use self::data_source::DataSource; + +/// Represents a valid Amp subgraph manifest. +/// +/// This manifest contains parsed, formatted, and resolved data. +#[derive(Debug, Clone)] +pub struct Manifest { + /// The schema of the subgraph. + /// + /// Contains all the entities, aggregations, and relationships between them. + pub schema: InputSchema, + + /// The Amp data sources of the subgraph. + /// + /// An Amp subgraph can only contain Amp data sources. + pub data_sources: Vec, +} + +impl Manifest { + /// Resolves and returns a valid Amp subgraph manifest. + pub async fn resolve( + logger: &Logger, + link_resolver: Arc, + amp_client: Arc, + max_spec_version: Version, + deployment: DeploymentHash, + raw_manifest: serde_yaml::Mapping, + ) -> Result { + let unresolved_manifest = + UnresolvedSubgraphManifest::::parse(deployment.cheap_clone(), raw_manifest) + .context("failed to parse subgraph manifest")?; + + let resolved_manifest = unresolved_manifest + .resolve( + &deployment, + &link_resolver, + Some(amp_client), + logger, + max_spec_version, + ) + .await + .context("failed to resolve subgraph manifest")?; + + let BaseSubgraphManifest { + id: _, + spec_version: _, + features: _, + description: _, + repository: _, + schema, + data_sources, + graft: _, + templates: _, + chain: _, + indexer_hints: _, + } = resolved_manifest; + + let data_sources_count = data_sources.len(); + let amp_data_sources = data_sources + .into_iter() + .filter_map(|data_source| match data_source { + GenericDataSource::Amp(amp_data_source) => Some(amp_data_source), + _ => None, + }) + .collect_vec(); + + if amp_data_sources.is_empty() { + bail!("invalid subgraph manifest: failed to find Amp data sources"); + } + + if amp_data_sources.len() != data_sources_count { + bail!("invalid subgraph manifest: only Amp data sources are allowed"); + } + + Ok(Self { + schema, + data_sources: amp_data_sources, + }) + } +} diff --git a/graph/src/amp/mod.rs b/graph/src/amp/mod.rs new file mode 100644 index 00000000000..9541d450626 --- /dev/null +++ b/graph/src/amp/mod.rs @@ -0,0 +1,17 @@ +//! This module contains the functionality required to support Amp subgraphs. 
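// Illustrative note (a sketch, not part of the original change) on the identifier rule
// enforced by `validate_ident` in manifest/data_source/raw.rs for data source, ABI, and
// table names as well as query output columns: names must match
// `^[a-zA-Z_][a-zA-Z0-9_-]{0,100}$`, so for example "raw_logs", "_meta", and
// "erc20-transfers" are accepted, while "" (empty), "1logs" (leading digit), and
// "block number" (contains a space) are rejected.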
+ +pub mod client; +pub mod codec; +pub mod common; +pub mod error; +pub mod log; +pub mod manifest; +pub mod schema; +pub mod sql; +pub mod stream_aggregator; + +pub use self::{ + client::{flight_client::FlightClient, Client}, + codec::Codec, + manifest::Manifest, +}; diff --git a/graph/src/amp/schema/generator/entity.rs b/graph/src/amp/schema/generator/entity.rs new file mode 100644 index 00000000000..7e3fa5b8f6c --- /dev/null +++ b/graph/src/amp/schema/generator/entity.rs @@ -0,0 +1,171 @@ +use std::fmt; + +use anyhow::{bail, Context, Result}; +use inflector::Inflector; + +use crate::data::store::ValueType; + +/// A minimal representation of a subgraph entity. +pub(super) struct SchemaEntity { + name: String, + fields: Vec, +} + +impl SchemaEntity { + /// Converts the Arrow schema to a subgraph entity. + /// + /// # Errors + /// + /// Returns an error if Arrow fields cannot be converted to subgraph entity fields. + /// + /// The returned error is deterministic. + pub(super) fn new(name: String, arrow_schema: arrow::datatypes::Schema) -> Result { + let mut fields = arrow_schema + .fields() + .iter() + .map(|field| { + SchemaField::new(field) + .with_context(|| format!("failed to create field '{}'", field.name())) + }) + .collect::, _>>()?; + + if !fields + .iter() + .any(|field| field.name.as_str().eq_ignore_ascii_case("id")) + { + fields.push(SchemaField::id()); + } + + fields.sort_unstable_by_key(|field| field.name.clone()); + + Ok(Self { name, fields }) + } +} + +impl fmt::Display for SchemaEntity { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write! {f, "type {} @entity(immutable: true)", self.name.to_pascal_case()}?; + write! {f, " {{\n"}?; + for field in &self.fields { + write! {f, "\t{field}\n"}?; + } + write! {f, "}}"} + } +} + +/// A minimal representation of a subgraph entity field. +struct SchemaField { + name: String, + value_type: ValueType, + is_list: bool, + is_required: bool, +} + +impl SchemaField { + /// Converts the Arrow field to a subgraph entity field. + /// + /// # Errors + /// + /// Returns an error if: + /// - The Arrow field has an invalid name + /// - The Arrow field type cannot be converted to a subgraph entity value type + /// + /// The returned error is deterministic. + fn new(arrow_field: &arrow::datatypes::Field) -> Result { + let name = arrow_field.name().to_string(); + let (value_type, is_list) = arrow_data_type_to_value_type(arrow_field.data_type())?; + let is_required = !arrow_field.is_nullable(); + + Ok(Self { + name, + value_type, + is_list, + is_required, + }) + } + + /// Creates an `ID` subgraph entity field. + fn id() -> Self { + Self { + name: "id".to_string(), + value_type: ValueType::Bytes, + is_list: false, + is_required: true, + } + } +} + +impl fmt::Display for SchemaField { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write! {f, "{}: ", self.name.to_camel_case()}?; + if self.is_list { + write! {f, "["}?; + } + write! {f, "{}", self.value_type.to_str()}?; + if self.is_list { + write! {f, "]"}?; + } + if self.is_required { + write! 
{f, "!"}?; + } + Ok(()) + } +} + +fn arrow_data_type_to_value_type( + data_type: &arrow::datatypes::DataType, +) -> Result<(ValueType, bool)> { + use arrow::datatypes::DataType::*; + + let type_not_supported = || bail!("type '{data_type}' not supported"); + let value_type = match data_type { + Null => return type_not_supported(), + Boolean => ValueType::Boolean, + Int8 => ValueType::Int, + Int16 => ValueType::Int, + Int32 => ValueType::Int, + Int64 => ValueType::Int8, + UInt8 => ValueType::Int, + UInt16 => ValueType::Int, + UInt32 => ValueType::Int8, + UInt64 => ValueType::BigInt, + Float16 => ValueType::BigDecimal, + Float32 => ValueType::BigDecimal, + Float64 => ValueType::BigDecimal, + Timestamp(_, _) => ValueType::Timestamp, + Date32 => ValueType::Timestamp, + Date64 => ValueType::Timestamp, + Time32(_) => return type_not_supported(), + Time64(_) => return type_not_supported(), + Duration(_) => return type_not_supported(), + Interval(_) => return type_not_supported(), + Binary => ValueType::Bytes, + FixedSizeBinary(_) => ValueType::Bytes, + LargeBinary => ValueType::Bytes, + BinaryView => ValueType::Bytes, + Utf8 => ValueType::String, + LargeUtf8 => ValueType::String, + Utf8View => ValueType::String, + List(field) + | ListView(field) + | FixedSizeList(field, _) + | LargeList(field) + | LargeListView(field) => { + if field.data_type().is_nested() { + return type_not_supported(); + } + + return arrow_data_type_to_value_type(field.data_type()) + .map(|(value_type, _)| (value_type, true)); + } + Struct(_) => return type_not_supported(), + Union(_, _) => return type_not_supported(), + Dictionary(_, _) => return type_not_supported(), + Decimal128(_, _) => ValueType::BigDecimal, + Decimal256(_, _) => ValueType::BigDecimal, + Map(_, _) => return type_not_supported(), + RunEndEncoded(_, _) => return type_not_supported(), + }; + + Ok((value_type, false)) +} diff --git a/graph/src/amp/schema/generator/mod.rs b/graph/src/amp/schema/generator/mod.rs new file mode 100644 index 00000000000..117d710adbe --- /dev/null +++ b/graph/src/amp/schema/generator/mod.rs @@ -0,0 +1,65 @@ +mod entity; + +use anyhow::{Context, Result}; +use itertools::Itertools; + +use self::entity::SchemaEntity; +use crate::{data::subgraph::DeploymentHash, schema::InputSchema}; + +/// Generates a subgraph schema from a list of Arrow schemas. +/// +/// # Limitations +/// +/// The generated subgraph entities are immutable and do not contain any relationships to other entities within the schema. +/// +/// # Errors +/// +/// Returns an error if any of the Arrow schemas cannot be represented as valid subgraph entities. +/// +/// The returned error is deterministic. 
+pub fn generate_subgraph_schema( + deployment_hash: &DeploymentHash, + named_schemas: impl IntoIterator, +) -> Result { + let mut named_schemas = merge_related_schemas(named_schemas)?; + named_schemas.sort_unstable_by_key(|(name, _)| name.clone()); + + let entities = create_entities(named_schemas)?; + let mut subgraph_schema = String::new(); + + for entity in entities { + subgraph_schema.extend(std::iter::once(entity.to_string())); + subgraph_schema.push_str("\n\n"); + } + + let input_schema = InputSchema::parse_latest(&subgraph_schema, deployment_hash.to_owned()) + .context("failed to parse subgraph schema")?; + + Ok(input_schema) +} + +fn merge_related_schemas( + named_schemas: impl IntoIterator, +) -> Result> { + named_schemas + .into_iter() + .into_group_map_by(|(name, _)| name.clone()) + .into_iter() + .map(|(name, related_schemas)| { + let related_schemas = related_schemas.into_iter().map(|(_, schema)| schema); + + arrow::datatypes::Schema::try_merge(related_schemas).map(|schema| (name, schema)) + }) + .collect::, _>>() + .context("failed to merge schemas of related SQL queries") +} + +fn create_entities(queries: Vec<(String, arrow::datatypes::Schema)>) -> Result> { + queries + .into_iter() + .map(|(name, schema)| { + SchemaEntity::new(name.clone(), schema) + .with_context(|| format!("failed to create entity '{}'", name)) + }) + .collect::, _>>() +} diff --git a/graph/src/amp/schema/mod.rs b/graph/src/amp/schema/mod.rs new file mode 100644 index 00000000000..546777a14ff --- /dev/null +++ b/graph/src/amp/schema/mod.rs @@ -0,0 +1,3 @@ +mod generator; + +pub use self::generator::generate_subgraph_schema; diff --git a/graph/src/amp/sql/mod.rs b/graph/src/amp/sql/mod.rs new file mode 100644 index 00000000000..02355895afa --- /dev/null +++ b/graph/src/amp/sql/mod.rs @@ -0,0 +1,3 @@ +pub mod query_builder; + +pub use self::query_builder::{BlockRangeQueryBuilder, ContextQuery, ValidQuery}; diff --git a/graph/src/amp/sql/query_builder/block_range_query.rs b/graph/src/amp/sql/query_builder/block_range_query.rs new file mode 100644 index 00000000000..e9b91ca5136 --- /dev/null +++ b/graph/src/amp/sql/query_builder/block_range_query.rs @@ -0,0 +1,189 @@ +use std::{ + collections::BTreeMap, + hash::{BuildHasher, Hash, Hasher}, + ops::{ControlFlow, RangeInclusive}, +}; + +use ahash::RandomState; +use alloy::primitives::BlockNumber; +use sqlparser_latest::ast::{self, VisitMut, VisitorMut}; + +use super::{extract_tables, parse_query, TableReference}; + +/// Limits the query execution to the specified block range. +/// +/// Wraps the `query` in a CTE, and creates CTEs for every table it references. +/// These CTEs load data from the referenced tables only on the specified block range. +/// All the table references in the original SQL query are replaced with the created CTE names. +/// +/// The output is ordered by block numbers. +pub(super) fn new_block_range_query<'a>( + query: &ast::Query, + block_number_column: &str, + block_range: &RangeInclusive, +) -> ast::Query { + // CTE names are unique within a SQL query. + // The hasher ensures that CTEs created for block range do not collide with user-defined CTEs. + // Constant seeds ensure consistent block range queries for the same input parameters. 
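+    //
+    // For example, `SELECT a FROM d` limited to blocks 0..=100 becomes, roughly
+    // (the CTE names are derived from the table hashes):
+    //
+    //   WITH block_range_<hash> AS (
+    //       SELECT * FROM "d" WHERE _block_num BETWEEN 0 AND 100
+    //   ),
+    //   source_<hash> AS (SELECT a FROM block_range_<hash> AS d)
+    //   SELECT source_<hash>.* FROM source_<hash>
+    //   ORDER BY source_<hash>.<block_number_column>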
+ let mut hasher = RandomState::with_seeds(0, 0, 0, 0).build_hasher(); + + let tables_to_ctes_mapping = new_tables_to_ctes_mapping(query, &mut hasher); + assert!(!tables_to_ctes_mapping.is_empty()); + + let mut cte_tables = Vec::with_capacity(tables_to_ctes_mapping.len()); + for (table, cte_table) in &tables_to_ctes_mapping { + cte_tables.push(format!( + "{cte_table} AS (SELECT * FROM {table} WHERE _block_num BETWEEN {start_block} AND {end_block})", + start_block = block_range.start(), + end_block = block_range.end() + )) + } + + let mut query = query.clone(); + let mut table_replacer = TableReplacer::new(tables_to_ctes_mapping); + let _: ControlFlow<()> = VisitMut::visit(&mut query, &mut table_replacer); + + let block_range_query = format!( + "WITH {cte_tables}, {source} AS ({query}) SELECT {source}.* FROM {source} ORDER BY {source}.{block_number_column}", + cte_tables = cte_tables.join(", "), + source = format!("source_{}", hasher.finish()) + ); + + parse_query(block_range_query).unwrap() +} + +/// Creates unique CTE names for every table referenced by the SQL query. +fn new_tables_to_ctes_mapping( + query: &ast::Query, + hasher: &mut impl Hasher, +) -> BTreeMap { + extract_tables(query) + .into_iter() + .map(|table| { + table.hash(hasher); + + (table, format!("block_range_{}", hasher.finish())) + }) + .collect() +} + +/// Visits the SQL query AST and replaces referenced table names with CTE names. +struct TableReplacer { + tables_to_ctes_mapping: BTreeMap, +} + +impl TableReplacer { + /// Creates a new table replacer. + fn new(tables_to_ctes_mapping: BTreeMap) -> Self { + Self { + tables_to_ctes_mapping, + } + } + + /// Replaces the table name of the current `table_factor` with the associated CTE name. + fn visit_table_factor(&mut self, table_factor: &mut ast::TableFactor) { + let ast::TableFactor::Table { name, alias, .. 
} = table_factor else { + return; + }; + + let Some(cte_table) = self + .tables_to_ctes_mapping + .get(&TableReference::with_object_name(name)) + else { + return; + }; + + // Set the alias to the original table name so that queries like `SELECT table.column FROM table` do not break + if alias.is_none() { + let last_name_part = name.0.last().unwrap(); + + *alias = Some(ast::TableAlias { + name: last_name_part.as_ident().unwrap().clone(), + columns: Vec::new(), + }) + } + + *name = ast::ObjectName(vec![ast::ObjectNamePart::Identifier(ast::Ident::new( + cte_table, + ))]); + } +} + +impl VisitorMut for TableReplacer { + type Break = (); + + fn pre_visit_table_factor( + &mut self, + table_factor: &mut ast::TableFactor, + ) -> ControlFlow { + self.visit_table_factor(table_factor); + ControlFlow::Continue(()) + } +} + +#[cfg(test)] +mod tests { + use super::super::parse_query; + use super::*; + + #[test] + fn query_with_one_table_reference_is_wrapped_with_block_range() { + let query = parse_query("SELECT a, b, c FROM d").unwrap(); + let block_number_column = "b"; + let block_range = 0..=1_000_000; + let block_range_query = new_block_range_query(&query, block_number_column, &block_range); + + assert_eq!( + block_range_query, + parse_query( + r#" + WITH block_range_1164572571450379730 AS ( + SELECT * FROM "d" WHERE _block_num BETWEEN 0 AND 1000000 + ), + source_1164572571450379730 AS ( + SELECT a, b, c FROM block_range_1164572571450379730 AS d + ) + SELECT + source_1164572571450379730.* + FROM + source_1164572571450379730 + ORDER BY + source_1164572571450379730.b + "# + ) + .unwrap(), + ) + } + + #[test] + fn query_with_multiple_table_references_is_wrapped_with_block_range() { + let query = parse_query("SELECT a, b, c FROM d JOIN e ON e.e = d.d").unwrap(); + let block_number_column = "b"; + let block_range = 0..=1_000_000; + let block_range_query = new_block_range_query(&query, block_number_column, &block_range); + + assert_eq!( + block_range_query, + parse_query( + r#" + WITH block_range_1164572571450379730 AS ( + SELECT * FROM "d" WHERE _block_num BETWEEN 0 AND 1000000 + ), + block_range_13063992259633584610 AS ( + SELECT * FROM "e" WHERE _block_num BETWEEN 0 AND 1000000 + ), + source_13063992259633584610 AS ( + SELECT a, b, c FROM block_range_1164572571450379730 AS d JOIN block_range_13063992259633584610 AS e ON e.e = d.d + ) + SELECT + source_13063992259633584610.* + FROM + source_13063992259633584610 + ORDER BY + source_13063992259633584610.b + "# + ) + .unwrap(), + ) + } +} diff --git a/graph/src/amp/sql/query_builder/context_query.rs b/graph/src/amp/sql/query_builder/context_query.rs new file mode 100644 index 00000000000..cdff33ca4a3 --- /dev/null +++ b/graph/src/amp/sql/query_builder/context_query.rs @@ -0,0 +1,103 @@ +use ahash::RandomState; +use itertools::Itertools; +use sqlparser_latest::ast; + +use super::parse_query; + +/// Wraps the SQL query with additional context columns from a separate dataset. +/// +/// Creates two CTEs: one wrapping the input `query` and another loading context columns +/// from the specified context dataset and table. Joins both CTEs on block numbers to +/// include the context columns in the original query's output. +/// +/// This enables including columns required by Amp subgraphs in the original SQL query. +pub(super) fn new_context_query<'a>( + query: &ast::Query, + block_number_column: &str, + context_dataset: &str, + context_table: &str, + context_columns: impl IntoIterator, +) -> ast::Query { + // CTE names are unique within a SQL query. 
+ // The hasher ensures that CTEs created for context do not collide with user-defined CTEs. + // Constant seeds ensure consistent context queries for the same input parameters. + let hasher = RandomState::with_seeds(0, 0, 0, 0); + let query_hash = hasher.hash_one(query); + + let context_columns = context_columns.into_iter().collect_vec(); + assert!(!context_columns.is_empty()); + + let context_cte = format!("context_{query_hash}"); + let source_cte = format!("source_{query_hash}"); + + let context_query = format!( + " + WITH {context_cte} AS ( + SELECT DISTINCT _block_num, {input_context_columns} FROM {context_dataset}.{context_table} + ), + {source_cte} AS ( + {query} + ) + SELECT + {output_context_columns}, + {source_cte}.* + FROM + {source_cte} + INNER JOIN {context_cte} ON + {context_cte}._block_num = {source_cte}.{block_number_column} + ", + input_context_columns = context_columns.join(", "), + output_context_columns = context_columns + .iter() + .map(|context_column| format!("{context_cte}.{context_column}")) + .join(", "), + ); + + parse_query(context_query).unwrap() +} + +#[cfg(test)] +mod tests { + use super::super::parse_query; + use super::*; + + #[test] + fn query_is_wrapped_with_context() { + let query = parse_query("SELECT a, b, c FROM d").unwrap(); + let block_number_column = "b"; + let context_dataset = "cx_a"; + let context_table = "cx_b"; + let context_columns = ["cx_c", "cx_d"]; + + let context_query = new_context_query( + &query, + block_number_column, + context_dataset, + context_table, + context_columns, + ); + + assert_eq!( + context_query, + parse_query( + " + WITH context_10500256449332496249 AS ( + SELECT DISTINCT _block_num, cx_c, cx_d FROM cx_a.cx_b + ), + source_10500256449332496249 AS ( + SELECT a, b, c FROM d + ) + SELECT + context_10500256449332496249.cx_c, + context_10500256449332496249.cx_d, + source_10500256449332496249.* + FROM + source_10500256449332496249 + INNER JOIN context_10500256449332496249 ON + context_10500256449332496249._block_num = source_10500256449332496249.b + " + ) + .unwrap() + ) + } +} diff --git a/graph/src/amp/sql/query_builder/event_signature_resolver.rs b/graph/src/amp/sql/query_builder/event_signature_resolver.rs new file mode 100644 index 00000000000..89ab8a31a51 --- /dev/null +++ b/graph/src/amp/sql/query_builder/event_signature_resolver.rs @@ -0,0 +1,183 @@ +use std::ops::ControlFlow; + +use alloy::json_abi::JsonAbi; +use anyhow::{bail, Context, Result}; +use sqlparser_latest::ast::{self, visit_expressions_mut}; + +static FUNCTION_NAME: &str = "sg_event_signature"; + +/// Replaces `sg_event_signature('CONTRACT_NAME', 'EVENT_NAME')` function calls with +/// the correct event signature based on `abis`. +/// +/// # Errors +/// +/// Returns an error if: +/// - The function is called with incorrect arguments +/// - The contract name is not found in `abis` +/// - The event name is not found in `abis` +/// +/// The returned error is deterministic. 
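+///
+/// # Example
+///
+/// A sketch of the rewrite this performs (contract, event, and column names
+/// are placeholders; the ABI registered under `'Token'` is assumed to declare
+/// the referenced event):
+///
+/// ```text
+/// -- before
+/// SELECT * FROM logs WHERE signature = sg_event_signature('Token', 'Transfer')
+/// -- after
+/// SELECT * FROM logs WHERE signature = 'event Transfer(address indexed from, address indexed to, uint256 value)'
+/// ```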
+pub(super) fn resolve_event_signatures( + query: &mut ast::Query, + abis: &[(&str, &JsonAbi)], +) -> Result<()> { + let visit_result = visit_expressions_mut(query, |expr| match visit_expr(expr, abis) { + Ok(()) => ControlFlow::Continue(()), + Err(e) => ControlFlow::Break(e), + }); + + if let ControlFlow::Break(e) = visit_result { + return Err(e).with_context(|| format!("failed to resolve '{FUNCTION_NAME}' calls")); + } + + Ok(()) +} + +fn visit_expr(expr: &mut ast::Expr, abis: &[(&str, &JsonAbi)]) -> Result<()> { + let ast::Expr::Function(function) = expr else { + return Ok(()); + }; + + if !FUNCTION_NAME.eq_ignore_ascii_case(&function.name.to_string()) { + return Ok(()); + } + + let Some((contract_name, event_name)) = get_args(function) else { + bail!("invalid function call: expected `{FUNCTION_NAME}('CONTRACT_NAME', 'EVENT_NAME')`, found: `{function}`"); + }; + + let Some(event) = get_event(abis, contract_name, event_name) else { + bail!("invalid function call: unknown contract '{contract_name}' or event '{event_name}'"); + }; + + let signature = ast::Value::SingleQuotedString(event.full_signature()).with_empty_span(); + *expr = ast::Expr::Value(signature); + + Ok(()) +} + +fn get_args<'a>(function: &'a ast::Function) -> Option<(&'a str, &'a str)> { + let ast::FunctionArguments::List(args) = &function.args else { + return None; + }; + + if args.args.len() != 2 { + return None; + } + + match (get_arg(&args.args[0]), get_arg(&args.args[1])) { + (Some(contract_name), Some(event_name)) => Some((contract_name, event_name)), + _ => None, + } +} + +fn get_arg<'a>(arg: &'a ast::FunctionArg) -> Option<&'a str> { + let ast::FunctionArg::Unnamed(ast::FunctionArgExpr::Expr(expr)) = arg else { + return None; + }; + + match expr { + ast::Expr::Value(ast::ValueWithSpan { + value: ast::Value::SingleQuotedString(value), + .. + }) if !value.is_empty() => Some(value), + _ => None, + } +} + +fn get_event<'a>( + abis: &'a [(&str, &JsonAbi)], + contract_name: &str, + event_name: &str, +) -> Option<&'a alloy::json_abi::Event> { + abis.iter() + .filter(|(name, _)| *name == contract_name) + .map(|(_, contract)| contract.event(event_name)) + .flatten() + .map(|events| events.first()) + .flatten() + .next() +} + +#[cfg(test)] +mod tests { + use super::super::parse_query; + use super::*; + + use self::fixtures::*; + + mod fixtures { + use std::sync::LazyLock; + + use super::*; + + pub(super) static ABIS: LazyLock> = LazyLock::new(|| { + vec![ + ("ContractA", JsonAbi::parse([&*event("TransferA")]).unwrap()), + ("ContractB", JsonAbi::parse([&*event("TransferB")]).unwrap()), + ("ContractB", JsonAbi::parse([&*event("TransferC")]).unwrap()), + ] + }); + + pub(super) fn event(name: &str) -> String { + format!("event {name}(address indexed from, address indexed to, address value)") + } + } + + macro_rules! test_resolve_event_signatures { + ($($name:ident: $query:expr => $expected:expr),* $(,)?) => { + $( + #[test] + fn $name() { + let mut query = parse_query($query).unwrap(); + let abis = ABIS.iter().map(|abi| (abi.0, &abi.1)).collect::>(); + let result = resolve_event_signatures(&mut query, &abis); + + match $expected { + Result::<&str, ()>::Ok(expected) => { + result.unwrap(); + assert_eq!(query, parse_query(expected).unwrap()); + }, + Err(_) => { + result.unwrap_err(); + } + } + } + )* + }; + } + + test_resolve_event_signatures! 
{ + nothing_to_resolve: "SELECT a FROM b" => Ok("SELECT a FROM b"), + + call_with_no_arguments: "SELECT a FROM b WHERE c = sg_event_signature()" => Err(()), + call_with_one_argument: "SELECT a FROM b WHERE c = sg_event_signature('ContractA')" => Err(()), + call_with_first_invalid_argument: "SELECT a FROM b WHERE c = sg_event_signature(ContractA, 'TransferA')" => Err(()), + call_with_second_invalid_argument: "SELECT a FROM b WHERE c = sg_event_signature('ContractA', TransferA)" => Err(()), + call_with_two_invalid_arguments: "SELECT a FROM b WHERE c = sg_event_signature(ContractA, TransferA)" => Err(()), + call_with_unknown_contract: "SELECT a FROM b WHERE c = sg_event_signature('ContractX', 'TransferA')" => Err(()), + call_with_unknown_event: "SELECT a FROM b WHERE c = sg_event_signature('ContractA', 'TransferX')" => Err(()), + call_with_contract_and_event_mismatch: "SELECT a FROM b WHERE c = sg_event_signature('ContractA', 'TransferB')" => Err(()), + call_with_invalid_argument_cases: "SELECT a FROM b WHERE c = sg_event_signature('contractA', 'transferA')" => Err(()), + + resolve_one_call: + "SELECT a FROM b WHERE c = sg_event_signature('ContractA', 'TransferA')" => + Ok(&*format!("SELECT a FROM b WHERE c = '{}'", event("TransferA"))), + + resolve_multiple_calls: + "SELECT a FROM b WHERE c = sg_event_signature('ContractA', 'TransferA') OR d = sg_event_signature('ContractA', 'TransferA')" => + Ok(&*format!("SELECT a FROM b WHERE c = '{}' OR d = '{}'", event("TransferA"), event("TransferA"))), + + resolve_multiple_calls_with_different_arguments: + "SELECT a FROM b WHERE c = sg_event_signature('ContractA', 'TransferA') OR d = sg_event_signature('ContractB', 'TransferB')" => + Ok(&*format!("SELECT a FROM b WHERE c = '{}' OR d = '{}'", event("TransferA"), event("TransferB"))), + + resolve_multiple_calls_with_events_from_different_abis_with_the_same_name: + "SELECT a FROM b WHERE c = sg_event_signature('ContractB', 'TransferB') OR d = sg_event_signature('ContractB', 'TransferC')" => + Ok(&*format!("SELECT a FROM b WHERE c = '{}' OR d = '{}'", event("TransferB"), event("TransferC"))), + + resolve_calls_with_case_insensitive_function_name: + "SELECT a FROM b WHERE c = sg_Event_SIGNATURE('ContractA', 'TransferA')" => + Ok(&*format!("SELECT a FROM b WHERE c = '{}'", event("TransferA"))), + } +} diff --git a/graph/src/amp/sql/query_builder/mod.rs b/graph/src/amp/sql/query_builder/mod.rs new file mode 100644 index 00000000000..5f5458ec092 --- /dev/null +++ b/graph/src/amp/sql/query_builder/mod.rs @@ -0,0 +1,191 @@ +mod block_range_query; +mod context_query; +mod event_signature_resolver; +mod parser; +mod source_address_resolver; +mod table_extractor; +mod table_validator; + +use std::{fmt, ops::RangeInclusive}; + +use alloy::{ + json_abi::JsonAbi, + primitives::{Address, BlockNumber}, +}; +use anyhow::{bail, Context, Result}; +use itertools::Itertools; +use sqlparser_latest::ast; + +use self::{ + block_range_query::new_block_range_query, + context_query::new_context_query, + event_signature_resolver::resolve_event_signatures, + parser::parse_query, + source_address_resolver::resolve_source_address, + table_extractor::{extract_tables, TableReference}, + table_validator::validate_tables, +}; + +/// Represents a valid SQL query that can be executed on an Amp server. +#[derive(Debug, Clone)] +pub struct ValidQuery { + query: ast::Query, +} + +impl ValidQuery { + /// Parses, validates and resolves the input SQL query. 
+ /// + /// # Errors + /// + /// Returns an error if: + /// - The SQL query cannot be parsed + /// - The SQL query is not valid + /// - The SQL query cannot be resolved + /// + /// The returned error is deterministic. + pub fn new<'a>( + sql: &str, + dataset: &str, + tables: impl IntoIterator, + source_address: &Address, + abis: impl IntoIterator, + ) -> Result { + let mut query = parse_query(sql).context("failed to parse SQL query")?; + + Self::validate(&query, dataset, tables).context("failed to validate SQL query")?; + Self::resolve(&mut query, source_address, abis).context("failed to resolve SQL query")?; + + Ok(Self { query }) + } + + /// Validates the SQL query. + /// + /// # Errors + /// + /// Returns an error if: + /// - The SQL query references unknown datasets or tables + /// - The SQL query uses custom `SETTINGS` + /// + /// The returned error is deterministic. + fn validate<'a>( + query: &ast::Query, + dataset: &str, + tables: impl IntoIterator, + ) -> Result<()> { + validate_tables(query, dataset, tables)?; + + if query.settings.is_some() { + bail!("custom SETTINGS are not allowed"); + } + + Ok(()) + } + + /// Resolves subgraph-specific function calls in the SQL query. + /// + /// # Errors + /// + /// Returns an error if: + /// - Source address function calls cannot be resolved + /// - Event signature function calls cannot be resolved + /// + /// The returned error is deterministic. + fn resolve<'a>( + query: &mut ast::Query, + source_address: &Address, + abis: impl IntoIterator, + ) -> Result<()> { + resolve_source_address(query, source_address)?; + resolve_event_signatures(query, &abis.into_iter().collect_vec())?; + + Ok(()) + } +} + +impl fmt::Display for ValidQuery { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.query) + } +} + +/// Represents a valid SQL query that contains columns required by Amp subgraphs. +#[derive(Debug, Clone)] +pub struct ContextQuery { + query: ast::Query, + block_number_column: String, +} + +impl ContextQuery { + /// Wraps the SQL query with additional context columns from a separate dataset. + /// + /// Creates two CTEs: one wrapping the input `query` and another loading context columns + /// from the specified context dataset and table. Joins both CTEs on block numbers to + /// include the context columns in the original query's output. + /// + /// This enables including columns required by Amp subgraphs in the original SQL query. + pub fn new<'a>( + valid_query: ValidQuery, + block_number_column: &str, + context_dataset: &str, + context_table: &str, + context_columns: impl IntoIterator, + ) -> Self { + let ValidQuery { query } = valid_query; + + let query = new_context_query( + &query, + block_number_column, + context_dataset, + context_table, + context_columns, + ); + + Self { + query, + block_number_column: block_number_column.to_string(), + } + } +} + +/// Builds valid SQL queries for execution on an Amp server with block range limits. +#[derive(Debug, Clone)] +pub struct BlockRangeQueryBuilder { + query: ast::Query, + block_number_column: String, +} + +impl BlockRangeQueryBuilder { + /// Creates a new block range query builder with the specified valid SQL query. + pub fn new(valid_query: ValidQuery, block_number_column: &str) -> Self { + let ValidQuery { query } = valid_query; + + Self { + query, + block_number_column: block_number_column.to_string(), + } + } + + /// Creates a new block range query builder with the specified context SQL query. 
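+    ///
+    /// # Example
+    ///
+    /// A rough sketch of the full builder flow (the dataset, table, and column
+    /// names are placeholders):
+    ///
+    /// ```ignore
+    /// let abis: Vec<(&str, &JsonAbi)> = Vec::new();
+    /// let valid_query = ValidQuery::new(
+    ///     "SELECT block_num, value FROM my_dataset.transfers",
+    ///     "my_dataset",
+    ///     ["transfers"],
+    ///     &Address::ZERO,
+    ///     abis,
+    /// )?;
+    ///
+    /// let context_query = ContextQuery::new(
+    ///     valid_query,
+    ///     "block_num",
+    ///     "edgeandnode/ethereum_mainnet",
+    ///     "blocks",
+    ///     ["hash", "timestamp"],
+    /// );
+    ///
+    /// let sql = BlockRangeQueryBuilder::new_with_context(context_query)
+    ///     .build_with_block_range(&(0..=1_000_000));
+    /// ```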
+ pub fn new_with_context(context_query: ContextQuery) -> Self { + let ContextQuery { + query, + block_number_column, + } = context_query; + + Self { + query, + block_number_column, + } + } + + /// Limits the query execution to the specified block range. + /// + /// Wraps this SQL query in a CTE, and creates CTEs for every table it references. + /// These CTEs load data from the referenced tables only on the specified block range. + /// All the table references in the original SQL query are replaced with the created CTE names. + /// + /// The output is ordered by block numbers. + pub fn build_with_block_range(&self, block_range: &RangeInclusive) -> String { + new_block_range_query(&self.query, &self.block_number_column, block_range).to_string() + } +} diff --git a/graph/src/amp/sql/query_builder/parser.rs b/graph/src/amp/sql/query_builder/parser.rs new file mode 100644 index 00000000000..1f965b955b6 --- /dev/null +++ b/graph/src/amp/sql/query_builder/parser.rs @@ -0,0 +1,115 @@ +use std::ops::ControlFlow; + +use anyhow::{anyhow, bail, Context, Result}; +use itertools::Itertools; +use sqlparser_latest::{ + ast::{self, Visit, Visitor}, + dialect::GenericDialect, + parser::Parser, +}; + +/// Parses a SQL query and returns its AST. +/// +/// # Errors +/// +/// Returns an error if: +/// - The SQL query cannot be parsed +/// - The SQL query contains multiple SQL statements +/// - The SQL query is not a `SELECT` query +/// +/// The returned error is deterministic. +pub(super) fn parse_query(s: impl AsRef) -> Result { + let statement = Parser::parse_sql(&GenericDialect {}, s.as_ref()) + .context("invalid SQL query")? + .into_iter() + .exactly_one() + .map_err(|e| anyhow!("expected exactly one SQL statement, found {}", e.count()))?; + + let query = match statement { + ast::Statement::Query(query) => *query, + _ => bail!("invalid SQL query: only SELECT statements are allowed"), + }; + + if let ControlFlow::Break(e) = query.visit(&mut AllowOnlySelectQueries) { + return Err(e); + } + + Ok(query) +} + +/// Validates that the SQL query AST contains only `SELECT` queries in subqueries. +struct AllowOnlySelectQueries; + +impl AllowOnlySelectQueries { + /// Returns an error if the `set_expr` is not a `SELECT` expression. + fn visit_set_expr(&self, set_expr: &ast::SetExpr) -> Result<()> { + match set_expr { + ast::SetExpr::Select(_) + | ast::SetExpr::Query(_) + | ast::SetExpr::Values(_) + | ast::SetExpr::Table(_) => Ok(()), + ast::SetExpr::SetOperation { left, right, .. } => { + self.visit_set_expr(left)?; + self.visit_set_expr(right)?; + Ok(()) + } + ast::SetExpr::Insert(_) | ast::SetExpr::Update(_) | ast::SetExpr::Delete(_) => { + bail!("invalid SQL query: only SELECT queries are allowed") + } + } + } +} + +impl Visitor for AllowOnlySelectQueries { + type Break = anyhow::Error; + + fn pre_visit_query(&mut self, query: &ast::Query) -> ControlFlow { + match self.visit_set_expr(&query.body) { + Ok(()) => ControlFlow::Continue(()), + Err(e) => ControlFlow::Break(e), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + macro_rules! test_parse_query { + ($($name:ident: $input:expr => $expected:expr),* $(,)?) => { + $( + #[test] + fn $name() { + let result = parse_query($input); + + match $expected { + Result::<&str, &str>::Ok(expected) => { + assert_eq!(result.unwrap().to_string(), expected); + }, + Err(e) => { + assert_eq!(result.unwrap_err().to_string(), e); + } + } + } + )* + }; + } + + test_parse_query! 
{ + invalid_query: "SELECT" => Err("invalid SQL query"), + multiple_statements: "SELECT a FROM b; SELECT c FROM d" => Err("expected exactly one SQL statement, found 2"), + insert_statement: "INSERT INTO a VALUES (b)" => Err("invalid SQL query: only SELECT statements are allowed"), + update_statement: "UPDATE a SET b = c" => Err("invalid SQL query: only SELECT statements are allowed"), + delete_statement: "DELETE FROM a WHERE b = c" => Err("invalid SQL query: only SELECT statements are allowed"), + truncate_statement: "TRUNCATE TABLE a" => Err("invalid SQL query: only SELECT statements are allowed"), + drop_statement: "DROP TABLE a" => Err("invalid SQL query: only SELECT statements are allowed"), + + nested_insert_query: "WITH a AS (INSERT INTO b VALUES (c) RETURNING d) SELECT * FROM a" => Err("invalid SQL query: only SELECT queries are allowed"), + nested_update_query: "WITH a AS (UPDATE b SET c = d RETURNING e) SELECT * FROM a" => Err("invalid SQL query: only SELECT queries are allowed"), + nested_delete_query: "WITH a AS (DELETE FROM b WHERE c = d RETURNING e) SELECT * FROM a" => Err("invalid SQL query: only SELECT queries are allowed"), + + valid_query: "SELECT a FROM b" => Ok("SELECT a FROM b"), + valid_query_with_cte: "WITH a AS (SELECT b FROM c) SELECT * FROM a" => Ok("WITH a AS (SELECT b FROM c) SELECT * FROM a"), + valid_query_with_join: "SELECT a FROM b INNER JOIN c ON c.c = b.b" => Ok("SELECT a FROM b INNER JOIN c ON c.c = b.b"), + } +} diff --git a/graph/src/amp/sql/query_builder/source_address_resolver.rs b/graph/src/amp/sql/query_builder/source_address_resolver.rs new file mode 100644 index 00000000000..579e0873bb6 --- /dev/null +++ b/graph/src/amp/sql/query_builder/source_address_resolver.rs @@ -0,0 +1,133 @@ +use std::ops::ControlFlow; + +use alloy::primitives::Address; +use anyhow::{bail, Context, Result}; +use sqlparser_latest::ast::{self, visit_expressions_mut}; + +static FUNCTION_NAME: &str = "sg_source_address"; + +/// Replaces `sg_source_address()` function calls in the SQL query with the `source_address`. +/// +/// # Errors +/// +/// Returns an error if the function is called with any arguments. +/// +/// The returned error is deterministic. 
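+///
+/// # Example
+///
+/// A sketch of the rewrite (shown for the zero address; column names are
+/// placeholders):
+///
+/// ```text
+/// -- before
+/// SELECT * FROM logs WHERE address = sg_source_address()
+/// -- after
+/// SELECT * FROM logs WHERE address = arrow_cast(X'0000000000000000000000000000000000000000', 'FixedSizeBinary(20)')
+/// ```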
+pub(super) fn resolve_source_address( + query: &mut ast::Query, + source_address: &Address, +) -> Result<()> { + let visit_result = + visit_expressions_mut(query, |expr| match visit_expr(expr, source_address) { + Ok(()) => ControlFlow::Continue(()), + Err(e) => ControlFlow::Break(e), + }); + + if let ControlFlow::Break(e) = visit_result { + return Err(e).with_context(|| format!("failed to resolve '{FUNCTION_NAME}' calls")); + } + + Ok(()) +} + +fn visit_expr(expr: &mut ast::Expr, source_address: &Address) -> Result<()> { + let ast::Expr::Function(function) = expr else { + return Ok(()); + }; + + if !FUNCTION_NAME.eq_ignore_ascii_case(&function.name.to_string()) { + return Ok(()); + } + + match &function.args { + ast::FunctionArguments::None => {} + ast::FunctionArguments::List(args) if args.args.is_empty() => {} + _ => { + bail!("invalid function call: function '{FUNCTION_NAME}' does not accept arguments"); + } + } + + *function = ast::Function { + name: ast::ObjectName(vec![ast::ObjectNamePart::Identifier(ast::Ident::new( + "arrow_cast", + ))]), + uses_odbc_syntax: false, + parameters: ast::FunctionArguments::None, + args: ast::FunctionArguments::List(ast::FunctionArgumentList { + duplicate_treatment: None, + args: vec![ + ast::FunctionArg::Unnamed(ast::FunctionArgExpr::Expr(ast::Expr::Value( + ast::Value::HexStringLiteral(hex::encode(source_address)).with_empty_span(), + ))), + ast::FunctionArg::Unnamed(ast::FunctionArgExpr::Expr(ast::Expr::Value( + ast::Value::SingleQuotedString("FixedSizeBinary(20)".to_string()) + .with_empty_span(), + ))), + ], + clauses: vec![], + }), + filter: None, + null_treatment: None, + over: None, + within_group: vec![], + }; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::super::parse_query; + use super::*; + + use self::fixtures::*; + + mod fixtures { + use super::*; + + pub(super) const SOURCE_ADDRESS: Address = Address::ZERO; + + pub(super) const RESOLVED_FUNCTION_CALL: &str = + "arrow_cast(X'0000000000000000000000000000000000000000', 'FixedSizeBinary(20)')"; + } + + macro_rules! test_resolve_source_address { + ($($name:ident: $query:expr => $expected:expr),* $(,)?) => { + $( + #[test] + fn $name() { + let mut query = parse_query($query).unwrap(); + let result = resolve_source_address(&mut query, &SOURCE_ADDRESS); + + match $expected { + Result::<&str, ()>::Ok(expected) => { + result.unwrap(); + assert_eq!(query, parse_query(expected).unwrap()); + }, + Err(_) => { + result.unwrap_err(); + } + } + } + )* + }; + } + + test_resolve_source_address! 
{ + nothing_to_resolve: "SELECT a FROM b" => Ok("SELECT a FROM b"), + call_with_one_argument: "SELECT a FROM b WHERE c = sg_source_address(d)" => Err(()), + call_with_multiple_argument: "SELECT a FROM b WHERE c = sg_source_address(d, e)" => Err(()), + + resolve_one_call: + "SELECT a FROM b WHERE c = sg_source_address()" => + Ok(&*format!("SELECT a FROM b WHERE c = {RESOLVED_FUNCTION_CALL}")), + + resolve_multiple_calls: + "SELECT a FROM b WHERE c = sg_source_address() OR d = sg_source_address()" => + Ok(&*format!("SELECT a FROM b WHERE c = {RESOLVED_FUNCTION_CALL} OR d = {RESOLVED_FUNCTION_CALL}")), + + resolve_calls_with_case_insensitive_function_name: + "SELECT a FROM b WHERE c = sg_Source_ADDRESS()" => + Ok(&*format!("SELECT a FROM b WHERE c = {RESOLVED_FUNCTION_CALL}")), + } +} diff --git a/graph/src/amp/sql/query_builder/table_extractor.rs b/graph/src/amp/sql/query_builder/table_extractor.rs new file mode 100644 index 00000000000..b3cbc9d9d03 --- /dev/null +++ b/graph/src/amp/sql/query_builder/table_extractor.rs @@ -0,0 +1,207 @@ +use std::{collections::BTreeSet, fmt, ops::ControlFlow}; + +use sqlparser_latest::ast::{self, Visit, Visitor}; + +/// Returns all tables that are referenced by the SQL query. +/// +/// The table names are lowercased and quotes are ignored. +pub(super) fn extract_tables(query: &ast::Query) -> BTreeSet { + let mut table_extractor = TableExtractor::new(); + let _: ControlFlow<()> = Visit::visit(query, &mut table_extractor); + + table_extractor.tables +} + +/// Contains a normalized table reference. +/// +/// Used to compare physical table references with CTE names and custom tables. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub(super) struct TableReference(ast::ObjectName); + +impl TableReference { + const QUOTE_STYLE: char = '"'; + + /// Creates a new table reference from a custom dataset and table. + pub(super) fn new(dataset: &str, table: &str) -> Self { + Self( + vec![ + ast::Ident::with_quote(Self::QUOTE_STYLE, dataset), + ast::Ident::with_quote(Self::QUOTE_STYLE, table), + ] + .into(), + ) + } + + /// Creates a new table reference from an object name. + pub(super) fn with_object_name(object_name: &ast::ObjectName) -> Self { + Self::with_idents( + object_name + .0 + .iter() + .map(|object_name_part| match object_name_part { + ast::ObjectNamePart::Identifier(ident) => ident, + }), + ) + } + + /// Creates a new table reference from a list of identifiers. + pub(super) fn with_idents<'a>(idents: impl IntoIterator) -> Self { + Self( + idents + .into_iter() + .map(|ident| { + let ast::Ident { + value, + quote_style, + span: _, + } = ident; + + ast::Ident::with_quote(Self::QUOTE_STYLE, { + if quote_style.is_none() { + value.to_lowercase() + } else { + value.to_owned() + } + }) + }) + .collect::>() + .into(), + ) + } +} + +impl fmt::Display for TableReference { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.0) + } +} + +/// Visits the SQL query AST and extracts referenced table names, ignoring CTEs. +struct TableExtractor { + tables: BTreeSet, + cte_stack: CteStack, +} + +impl TableExtractor { + /// Creates a new empty table extractor. + fn new() -> Self { + Self { + tables: BTreeSet::new(), + cte_stack: CteStack::new(), + } + } + + /// Extracts and stores the table name from the current `table_factor`. + fn visit_table_factor(&mut self, table_factor: &ast::TableFactor) { + let ast::TableFactor::Table { name, .. 
} = table_factor else { + return; + }; + + let table_reference = TableReference::with_object_name(name); + if self.cte_stack.contains(&table_reference) { + return; + } + + self.tables.insert(table_reference); + } +} + +impl Visitor for TableExtractor { + type Break = (); + + fn pre_visit_query(&mut self, query: &ast::Query) -> ControlFlow { + self.cte_stack.pre_visit_query(query); + ControlFlow::Continue(()) + } + + fn post_visit_query(&mut self, _query: &ast::Query) -> ControlFlow { + self.cte_stack.post_visit_query(); + ControlFlow::Continue(()) + } + + fn pre_visit_table_factor( + &mut self, + table_factor: &ast::TableFactor, + ) -> ControlFlow { + self.visit_table_factor(table_factor); + ControlFlow::Continue(()) + } +} + +/// Maintains a list of active CTEs for each subquery scope. +struct CteStack { + stack: Vec>, +} + +impl CteStack { + /// Creates a new empty CTE stack. + fn new() -> Self { + Self { stack: Vec::new() } + } + + /// Returns `true` if the `table_reference` is present in the CTE list at any scope. + fn contains(&self, table_reference: &TableReference) -> bool { + self.stack + .iter() + .any(|scope| scope.contains(table_reference)) + } + + /// Creates a new subquery scope with all the CTEs of the current `query`. + fn pre_visit_query(&mut self, query: &ast::Query) { + let cte_tables = match &query.with { + Some(with) => with + .cte_tables + .iter() + .map(|cte_table| TableReference::with_idents([&cte_table.alias.name])) + .collect(), + None => BTreeSet::new(), + }; + + self.stack.push(cte_tables); + } + + /// Removes all the CTEs from the most recent subquery scope. + fn post_visit_query(&mut self) { + self.stack.pop(); + } +} + +#[cfg(test)] +mod tests { + use super::super::parse_query; + use super::*; + + macro_rules! test_extract_tables { + ($($name:ident: $input:expr => $expected:expr),* $(,)?) => { + $( + #[test] + fn $name() { + let query = parse_query($input).unwrap(); + assert_eq!( + extract_tables(&query).into_iter().map(|table| table.to_string()).collect::>(), + $expected.into_iter().map(|table| table.to_string()).collect::>() + ); + } + )* + }; + } + + test_extract_tables! 
{ + one_table: "SELECT a FROM b" => [r#""b""#], + multiple_tables_with_one_join: "SELECT a FROM b JOIN c ON c.c = b.b" => [r#""b""#, r#""c""#], + multiple_tables_with_multiple_joins: "SELECT a FROM b JOIN c ON c.c = b.b JOIN d ON d.d = b.b" => [r#""b""#, r#""c""#, r#""d""#], + one_table_with_one_cte: "WITH a AS (SELECT * FROM b) SELECT * FROM a" => [r#""b""#], + one_table_with_multiple_ctes: "WITH a AS (SELECT * FROM b), c AS (SELECT * FROM a) SELECT * FROM c" => [r#""b""#], + multiple_tables_with_multiple_ctes: "WITH a AS (SELECT * FROM b), c AS (SELECT * FROM d) SELECT * FROM a JOIN c ON c.c = a.a" => [r#""b""#, r#""d""#], + multiple_tables_with_nested_ctes: "WITH a AS (WITH b AS (SELECT * FROM c) SELECT * FROM d JOIN b ON b.b = d.d) SELECT * FROM a" => [r#""c""#, r#""d""#], + multiple_tables_with_union: "SELECT a FROM b UNION SELECT c FROM d" => [r#""b""#, r#""d""#], + multiple_tables_with_union_all: "SELECT a FROM b UNION ALL SELECT c FROM d" => [r#""b""#, r#""d""#], + + namespace_is_preserved: "SELECT a FROM b.c" => [r#""b"."c""#], + catalog_is_preserved: "SELECT a FROM b.c.d" => [r#""b"."c"."d""#], + unquoted_tables_are_lowercased: "SELECT a FROM B.C" => [r#""b"."c""#], + single_quotes_in_tables_are_converted_to_double_quotes: "SELECT a FROM 'B'.'C'" => [r#""B"."C""#], + double_quotes_in_tables_are_preserved: r#"SELECT a FROM "B"."C""# => [r#""B"."C""#], + backticks_in_tables_are_converted_to_double_quotes: "SELECT a FROM `B`.`C`" => [r#""B"."C""#], + } +} diff --git a/graph/src/amp/sql/query_builder/table_validator.rs b/graph/src/amp/sql/query_builder/table_validator.rs new file mode 100644 index 00000000000..c3aac82f2d3 --- /dev/null +++ b/graph/src/amp/sql/query_builder/table_validator.rs @@ -0,0 +1,99 @@ +use std::collections::BTreeSet; + +use anyhow::{bail, Result}; +use sqlparser_latest::ast; + +use super::{extract_tables, TableReference}; + +/// Validates that SQL query references only allowed dataset and tables. +/// +/// # Errors +/// +/// Returns an error if: +/// - The `query` does not reference any tables +/// - The `query` references a table not in `allowed_tables` +/// - The `query` references a dataset other than `allowed_dataset` +/// +/// The returned error is deterministic. +pub(super) fn validate_tables<'a>( + query: &ast::Query, + allowed_dataset: &str, + allowed_tables: impl IntoIterator, +) -> Result<()> { + let used_tables = extract_tables(query); + + if used_tables.is_empty() { + bail!("query does not use any tables"); + } + + let allowed_tables = allowed_tables + .into_iter() + .map(|allowed_table| TableReference::new(allowed_dataset, allowed_table)) + .collect::>(); + + for used_table in used_tables { + if !allowed_tables.contains(&used_table) { + bail!("table '{used_table}' not allowed"); + } + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::super::parse_query; + use super::*; + + macro_rules! test_validate_tables { + ($($name:ident: $input:expr, $dataset:expr, $tables:expr => $expected:expr),* $(,)?) => { + $( + #[test] + fn $name() { + let query = parse_query($input).unwrap(); + let result = validate_tables(&query, $dataset, $tables); + + match $expected { + Result::<(), &str>::Ok(()) => { + result.unwrap(); + }, + Err(e) => { + assert_eq!(result.unwrap_err().to_string(), e); + } + } + } + )* + }; + } + + test_validate_tables! 
{ + no_table_references: "SELECT *", "a", ["b"] => Err("query does not use any tables"), + missing_dataset: "SELECT * FROM b", "a", ["b"] => Err(r#"table '"b"' not allowed"#), + missing_table: "SELECT * FROM a", "a", ["b"] => Err(r#"table '"a"' not allowed"#), + invalid_dataset: "SELECT * FROM c.b", "a", ["b"] => Err(r#"table '"c"."b"' not allowed"#), + invalid_nested_dataset: "WITH a AS (SELECT * FROM c.b) SELECT * FROM a", "a", ["b"] => Err(r#"table '"c"."b"' not allowed"#), + invalid_table: "SELECT * FROM a.c", "a", ["b"] => Err(r#"table '"a"."c"' not allowed"#), + invalid_nested_table: "WITH a AS (SELECT * FROM a.c) SELECT * FROM a", "a", ["b"] => Err(r#"table '"a"."c"' not allowed"#), + using_catalog: "SELECT * FROM c.a.b", "a", ["b"] => Err(r#"table '"c"."a"."b"' not allowed"#), + + one_valid_table: "SELECT * FROM a.b", "a", ["b"] => Ok(()), + one_valid_nested_table: "WITH a AS (SELECT * FROM a.b) SELECT * FROM a", "a", ["b"] => Ok(()), + multiple_valid_tables: "SELECT * FROM a.b JOIN a.c ON a.c.c = a.b.b", "a", ["b", "c"] => Ok(()), + multiple_valid_nested_tables: "WITH a AS (SELECT * FROM a.b JOIN a.c ON a.c.c = a.b.b) SELECT * FROM a", "a", ["b", "c"] => Ok(()), + + unquoted_dataset_is_case_insensitive: "SELECT * FROM A.b", "a", ["b"] => Ok(()), + unquoted_tables_are_case_insensitive: "SELECT * FROM a.B", "a", ["b"] => Ok(()), + + single_quoted_dataset_is_case_sensitive: "SELECT * FROM 'A'.b", "a", ["b"] => Err(r#"table '"A"."b"' not allowed"#), + single_quoted_tables_are_case_sensitive: "SELECT * FROM a.'B'", "a", ["b"] => Err(r#"table '"a"."B"' not allowed"#), + + double_quoted_dataset_is_case_sensitive: r#"SELECT * FROM "A".b"#, "a", ["b"] => Err(r#"table '"A"."b"' not allowed"#), + double_quoted_tables_are_case_sensitive: r#"SELECT * FROM a."B""#, "a", ["b"] => Err(r#"table '"a"."B"' not allowed"#), + + backtick_quoted_dataset_is_case_sensitive: "SELECT * FROM `A`.b", "a", ["b"] => Err(r#"table '"A"."b"' not allowed"#), + backtick_quoted_tables_are_case_sensitive: "SELECT * FROM a.`B`", "a", ["b"] => Err(r#"table '"a"."B"' not allowed"#), + + allowed_dataset_is_case_sensitive: "SELECT * FROM a.b", "A", ["b"] => Err(r#"table '"a"."b"' not allowed"#), + allowed_tables_are_case_sensitive: "SELECT * FROM a.b", "a", ["B"] => Err(r#"table '"a"."b"' not allowed"#), + } +} diff --git a/graph/src/amp/stream_aggregator/error.rs b/graph/src/amp/stream_aggregator/error.rs new file mode 100644 index 00000000000..a2ba55f71e2 --- /dev/null +++ b/graph/src/amp/stream_aggregator/error.rs @@ -0,0 +1,51 @@ +use std::sync::Arc; + +use thiserror::Error; + +use crate::amp::error::IsDeterministic; + +#[derive(Debug, Error)] +pub enum Error { + #[error("failed to aggregate record batches: {0:#}")] + Aggregation(#[source] anyhow::Error), + + #[error("failed to buffer record batches from stream '{stream_name}': {source:#}")] + Buffer { + stream_name: Arc, + source: anyhow::Error, + }, + + #[error("failed to read record batch from stream '{stream_name}': {source:#}")] + Stream { + stream_name: Arc, + source: anyhow::Error, + is_deterministic: bool, + }, +} + +impl Error { + pub(super) fn stream(stream_name: Arc, e: E) -> Self + where + E: std::error::Error + IsDeterministic + Send + Sync + 'static, + { + let is_deterministic = e.is_deterministic(); + + Self::Stream { + stream_name, + source: anyhow::Error::from(e), + is_deterministic, + } + } +} + +impl IsDeterministic for Error { + fn is_deterministic(&self) -> bool { + match self { + Self::Aggregation(_) => true, + Self::Buffer { .. 
} => true, + Self::Stream { + is_deterministic, .. + } => *is_deterministic, + } + } +} diff --git a/graph/src/amp/stream_aggregator/mod.rs b/graph/src/amp/stream_aggregator/mod.rs new file mode 100644 index 00000000000..e2f0892252f --- /dev/null +++ b/graph/src/amp/stream_aggregator/mod.rs @@ -0,0 +1,231 @@ +mod error; +mod record_batch; + +use std::{ + pin::Pin, + sync::Arc, + task::{self, Poll}, +}; + +use anyhow::{anyhow, Result}; +use arrow::array::RecordBatch; +use futures03::{stream::BoxStream, Stream, StreamExt, TryStreamExt}; +use slog::{debug, info, Logger}; + +use self::record_batch::Buffer; +use crate::{ + amp::{client::ResponseBatch, error::IsDeterministic, log::Logger as _}, + cheap_clone::CheapClone, +}; + +pub use self::{ + error::Error, + record_batch::{RecordBatchGroup, RecordBatchGroups, StreamRecordBatch}, +}; + +/// Reads record batches from multiple streams and groups them by block number and hash pairs. +/// +/// Processes each row in the response record batches and groups them by block number +/// and hash. When processing starts for a new block, all data from previous blocks +/// is grouped and streamed in batches. +/// +/// The reason the aggregation is required is to ensure compatibility with the existing +/// subgraph storage implementation. +/// +/// # Stream requirements +/// +/// - Every record batch must have valid block number and hash columns +/// - Every record batch must contain blocks in ascending order +/// +/// # Performance +/// +/// To ensure data consistency and ordered output, the aggregator waits for slower streams +/// to catch up with faster streams. The output stream speed matches the slowest input stream. +pub struct StreamAggregator { + named_streams: Vec<(Arc, BoxStream<'static, Result>)>, + buffer: Buffer, + logger: Logger, + + /// Indicates whether all streams are fully consumed. + is_finalized: bool, + + /// Indicates whether any stream has produced an error. + /// + /// When `true`, the stream aggregator stops polling all other streams. + is_failed: bool, +} + +impl StreamAggregator { + /// Creates a new stream aggregator from the `streams` with a bounded buffer. 
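+    ///
+    /// # Example
+    ///
+    /// A rough sketch of wiring two response streams into one aggregated
+    /// stream (`stream_a` and `stream_b` are placeholders for boxed streams of
+    /// `Result<ResponseBatch, _>`):
+    ///
+    /// ```ignore
+    /// let mut aggregator = StreamAggregator::new(
+    ///     &logger,
+    ///     [("transfers", stream_a), ("approvals", stream_b)],
+    ///     16, // buffer at most 16 record batches per stream
+    /// );
+    ///
+    /// while let Some(groups) = aggregator.next().await {
+    ///     // Each group maps a (block number, block hash) pair to the record
+    ///     // batches produced by every input stream for that block.
+    ///     let groups = groups?;
+    /// }
+    /// ```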
+ pub fn new( + logger: &Logger, + named_streams: impl IntoIterator>)>, + max_buffer_size: usize, + ) -> Self + where + E: std::error::Error + IsDeterministic + Send + Sync + 'static, + { + let logger = logger.component("AmpStreamAggregator"); + + let named_streams = named_streams + .into_iter() + .map(|(stream_name, stream)| { + let stream_name: Arc = stream_name.into(); + ( + stream_name.cheap_clone(), + stream + .map_err({ + let stream_name = stream_name.cheap_clone(); + move |e| Error::stream(stream_name.cheap_clone(), e) + }) + .try_filter_map({ + let stream_name = stream_name.cheap_clone(); + move |response_batch| { + let stream_name = stream_name.cheap_clone(); + async move { + match response_batch { + ResponseBatch::Batch { data } => Ok(Some(data)), + ResponseBatch::Reorg(_) => Err(Error::Stream { + stream_name: stream_name.cheap_clone(), + source: anyhow!("chain reorg"), + is_deterministic: false, + }), + } + } + } + }) + .boxed(), + ) + }) + .collect::>(); + + let num_streams = named_streams.len(); + + info!(logger, "Initializing stream aggregator"; + "num_streams" => num_streams, + "max_buffer_size" => max_buffer_size + ); + + Self { + named_streams, + buffer: Buffer::new(num_streams, max_buffer_size), + logger, + is_finalized: false, + is_failed: false, + } + } + + fn poll_all_streams( + &mut self, + cx: &mut task::Context<'_>, + ) -> Poll>> { + let mut made_progress = false; + + for (stream_index, (stream_name, stream)) in self.named_streams.iter_mut().enumerate() { + let logger = self.logger.new(slog::o!( + "stream_index" => stream_index, + "stream_name" => stream_name.cheap_clone() + )); + + if self.buffer.is_finalized(stream_index) { + continue; + } + + if self.buffer.is_blocked(stream_index) { + self.is_failed = true; + + return Poll::Ready(Some(Err(Error::Buffer { + stream_name: stream_name.cheap_clone(), + source: anyhow!("buffer is blocked"), + }))); + } + + if !self.buffer.has_capacity(stream_index) { + continue; + } + + match stream.poll_next_unpin(cx) { + Poll::Ready(Some(Ok(record_batch))) if record_batch.num_rows() != 0 => { + let buffer_result = + self.buffer + .extend(stream_index, record_batch) + .map_err(|e| Error::Buffer { + stream_name: stream_name.cheap_clone(), + source: e, + }); + + match buffer_result { + Ok(()) => { + made_progress = true; + + debug!(logger, "Buffered record batch"; + "buffer_size" => self.buffer.size(stream_index), + "has_capacity" => self.buffer.has_capacity(stream_index) + ); + } + Err(e) => { + self.is_failed = true; + + return Poll::Ready(Some(Err(e))); + } + } + } + Poll::Ready(Some(Ok(_empty_record_batch))) => { + debug!(logger, "Received an empty record batch"); + } + Poll::Ready(Some(Err(e))) => { + self.is_failed = true; + + return Poll::Ready(Some(Err(e))); + } + Poll::Ready(None) => { + self.buffer.finalize(stream_index); + + if self.buffer.all_finalized() { + self.is_finalized = true; + } + + made_progress = true; + + info!(logger, "Stream completed"; + "buffer_size" => self.buffer.size(stream_index) + ); + } + Poll::Pending => { + // + } + } + } + + if made_progress { + if let Some(completed_groups) = + self.buffer.completed_groups().map_err(Error::Aggregation)? 
+ { + debug!(self.logger, "Sending completed record batch groups"; + "num_completed_groups" => completed_groups.len() + ); + + return Poll::Ready(Some(Ok(completed_groups))); + } + } + + if self.is_finalized { + info!(self.logger, "All streams completed"); + return Poll::Ready(None); + } + + Poll::Pending + } +} + +impl Stream for StreamAggregator { + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut task::Context<'_>) -> Poll> { + if self.is_finalized || self.is_failed { + return Poll::Ready(None); + } + + self.poll_all_streams(cx) + } +} diff --git a/graph/src/amp/stream_aggregator/record_batch/aggregator.rs b/graph/src/amp/stream_aggregator/record_batch/aggregator.rs new file mode 100644 index 00000000000..f513a2752ed --- /dev/null +++ b/graph/src/amp/stream_aggregator/record_batch/aggregator.rs @@ -0,0 +1,230 @@ +use std::{ + collections::{btree_map::Entry, BTreeMap, HashSet}, + sync::{Arc, Weak}, +}; + +use alloy::primitives::{BlockHash, BlockNumber}; +use anyhow::{bail, Context, Result}; +use arrow::array::RecordBatch; + +use super::{Decoder, GroupData}; +use crate::cheap_clone::CheapClone; + +/// Groups record batches by block number and hash pairs. +/// +/// This aggregator collects and organizes record batches based on their +/// associated block identifiers. +pub(super) struct Aggregator { + buffer: BTreeMap<(BlockNumber, BlockHash), GroupData>, + buffered_record_batches: Vec>, + is_finalized: bool, +} + +impl Aggregator { + /// Creates a new empty aggregator. + pub(super) fn new() -> Self { + Self { + buffer: BTreeMap::new(), + buffered_record_batches: Vec::new(), + is_finalized: false, + } + } + + /// Extends this aggregator with data from a new `record_batch`. + /// + /// Processes each row in the `record_batch` and groups them by block number + /// and hash. Each unique block is stored in the internal buffer with references + /// to all rows that belong to that block. + /// + /// # Errors + /// + /// Returns an error if: + /// - `record_batch` does not contain block numbers or hashes + /// - `record_batch` contains invalid block numbers or hashes + /// - `record_batch` data is not ordered + /// - `record_batch` data is not consistent + /// + /// The returned error is deterministic. + /// + /// # Panics + /// + /// Panics if this aggregator has already been finalized. 
+ pub(super) fn extend(&mut self, record_batch: RecordBatch) -> Result<()> { + assert!(!self.is_finalized); + + let record_batch = Arc::new(record_batch); + let decoder = Decoder::new(&record_batch)?; + + self.buffered_record_batches + .push(Arc::downgrade(&record_batch)); + + let num_rows = record_batch.num_rows(); + let mut record_batch_buffered: HashSet<(BlockNumber, BlockHash)> = + HashSet::with_capacity(num_rows); + + for row_index in 0..num_rows { + let err_cx = || format!("invalid group data at row {row_index}"); + let block_number = decoder.block_number(row_index).with_context(err_cx)?; + let block_hash = decoder.block_hash(row_index).with_context(err_cx)?; + let block_ptr = (block_number, block_hash); + + self.ensure_incremental_update(&block_ptr) + .with_context(err_cx)?; + + match self.buffer.entry(block_ptr) { + Entry::Vacant(entry) => { + entry.insert(GroupData::new(record_batch.cheap_clone(), row_index)); + record_batch_buffered.insert(block_ptr); + } + Entry::Occupied(mut entry) => { + let group_data = entry.get_mut(); + + if !record_batch_buffered.contains(&block_ptr) { + group_data.add(record_batch.cheap_clone(), row_index); + record_batch_buffered.insert(block_ptr); + } else { + group_data.add_row_index(row_index); + } + } + } + } + + Ok(()) + } + + /// Returns the block number and hash pair for the most recent completed group. + /// + /// A group is considered complete when: + /// - There is a group with a higher block number in the internal buffer + /// - This aggregator is finalized + /// + /// Any group in this aggregator with a lower block number than the one returned by + /// this method is also considered complete. + pub(super) fn max_completed_block_ptr(&self) -> Option<&(BlockNumber, BlockHash)> { + let mut iter = self.buffer.keys().rev(); + + if self.is_finalized { + return iter.next(); + } + + iter.skip(1).next() + } + + /// Returns `true` if this aggregator contains completed groups. + /// + /// A group is considered complete when: + /// - There is a group with a higher block number in the internal buffer + /// - This aggregator is finalized + pub(super) fn has_completed_groups(&self) -> bool { + (self.is_finalized && !self.buffer.is_empty()) || self.buffer.len() > 1 + } + + /// Removes and returns completed groups from this aggregator up to `max_block_ptr`. + /// + /// # Errors + /// + /// Returns an error if groups cannot be converted into record batches. + /// + /// The returned error is deterministic. + /// + /// # Panics + /// + /// Panics if `max_block_ptr` is greater than the most recent completed block in this aggregator. 
+ pub(super) fn completed_groups( + &mut self, + max_block_ptr: &(BlockNumber, BlockHash), + ) -> Result>> { + if self.buffer.is_empty() { + return Ok(None); + } + + let Some(max_completed_block_ptr) = self.max_completed_block_ptr() else { + return Ok(None); + }; + + assert!(max_block_ptr <= max_completed_block_ptr); + let incomplete_groups = self.buffer.split_off(max_block_ptr); + let mut completed_groups = std::mem::replace(&mut self.buffer, incomplete_groups); + + if let Some((block_ptr, _)) = self.buffer.first_key_value() { + if block_ptr == max_block_ptr { + let (block_ptr, group_data) = self.buffer.pop_first().unwrap(); + completed_groups.insert(block_ptr, group_data); + } + } + + if completed_groups.is_empty() { + return Ok(None); + } + + let completed_groups = completed_groups + .into_iter() + .map(|(block_ptr, group_data)| Ok((block_ptr, group_data.into_record_batch()?))) + .collect::>>()?; + + self.buffered_record_batches + .retain(|weak_ref| weak_ref.strong_count() > 0); + + Ok(Some(completed_groups)) + } + + /// Marks this aggregator as finalized. + /// + /// A finalized aggregator cannot be extended. + pub(super) fn finalize(&mut self) { + self.is_finalized = true; + } + + /// Returns `true` if this aggregator is finalized. + pub(super) fn is_finalized(&self) -> bool { + self.is_finalized + } + + /// Returns the number of record batches that this aggregator holds strong references to. + pub(super) fn len(&self) -> usize { + self.buffered_record_batches + .iter() + .filter(|weak_ref| weak_ref.strong_count() > 0) + .count() + } + + /// Ensures that block updates arrive in sequential order. + /// + /// Validates that the provided block number and hash represent a valid + /// incremental update relative to the last block in the buffer. + /// + /// # Errors + /// + /// Returns an error if: + /// - The block number is less than the maximum stored block number + /// - The block number equals the maximum but has a different hash + /// + /// The returned error is deterministic. + /// + /// # Note + /// + /// Potential reorgs are not handled at this level and are + /// treated as data corruption. + fn ensure_incremental_update( + &self, + (block_number, block_hash): &(BlockNumber, BlockHash), + ) -> Result<()> { + let Some(((max_block_number, max_block_hash), _)) = self.buffer.last_key_value() else { + return Ok(()); + }; + + if block_number < max_block_number { + bail!("received block number {block_number} after {max_block_number}"); + } + + if block_number == max_block_number && block_hash != max_block_hash { + bail!( + "received block hash '0x{}' after '0x{}' for block number {block_number}", + hex::encode(&block_hash), + hex::encode(&max_block_hash) + ); + } + + Ok(()) + } +} diff --git a/graph/src/amp/stream_aggregator/record_batch/buffer.rs b/graph/src/amp/stream_aggregator/record_batch/buffer.rs new file mode 100644 index 00000000000..4b45680636c --- /dev/null +++ b/graph/src/amp/stream_aggregator/record_batch/buffer.rs @@ -0,0 +1,209 @@ +use std::collections::{btree_map::Entry, BTreeMap}; + +use alloy::primitives::{BlockHash, BlockNumber}; +use anyhow::{bail, Result}; +use arrow::array::RecordBatch; + +use super::{Aggregator, RecordBatchGroup, RecordBatchGroups, StreamRecordBatch}; + +/// Buffers record batches from multiple streams in memory and creates +/// groups of record batches by block number and hash pairs. 
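// Editor's note: a small standalone sketch of the `split_off` + `mem::replace` pattern used
// by `completed_groups` above. `BTreeMap::split_off(&k)` keeps keys below `k` in `self` and
// returns keys at or above `k`, so after the swap `completed` holds everything strictly below
// the boundary; the boundary entry itself is then moved over explicitly, as the code above
// does with `first_key_value` / `pop_first`. Key and value types here are simplified.
use std::collections::BTreeMap;

fn drain_completed(
    buffer: &mut BTreeMap<u64, &'static str>,
    boundary: u64,
) -> BTreeMap<u64, &'static str> {
    // Keys at or above `boundary` stay buffered as the incomplete tail.
    let incomplete = buffer.split_off(&boundary);
    // Swap: `buffer` now holds the incomplete tail, `completed` everything below the boundary.
    let mut completed = std::mem::replace(buffer, incomplete);
    // Move the boundary entry itself into the completed set.
    if let Some(group) = buffer.remove(&boundary) {
        completed.insert(boundary, group);
    }
    completed
}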
+pub(in super::super) struct Buffer { + aggregators: Vec, + num_streams: usize, + max_buffer_size: usize, +} + +impl Buffer { + /// Creates a new buffer that can handle exactly `num_streams` number of streams. + /// + /// Creates a new associated `Aggregator` for each stream. + /// The `max_buffer_size` specifies how many record batches for each stream can be buffered at most. + pub(in super::super) fn new(num_streams: usize, max_buffer_size: usize) -> Self { + let aggregators = (0..num_streams).map(|_| Aggregator::new()).collect(); + + Self { + aggregators, + num_streams, + max_buffer_size, + } + } + + /// Extends the aggregator for `stream_index` with data from a new `record_batch`. + /// + /// # Errors + /// + /// Errors if the aggregator cannot be extended. + /// + /// The returned error is deterministic. + /// + /// # Panics + /// + /// Panics if the `stream_index` is greater than the initialized number of streams. + pub(in super::super) fn extend( + &mut self, + stream_index: usize, + record_batch: RecordBatch, + ) -> Result<()> { + assert!(stream_index < self.num_streams); + self.aggregators[stream_index].extend(record_batch) + } + + /// Removes and returns all completed groups from this buffer. + /// + /// # Errors + /// + /// Errors if aggregators fail to return completed groups. + /// + /// The returned error is deterministic. + /// + /// # Panics + /// + /// Panics if aggregators return inconsistent responses. + pub(in super::super) fn completed_groups(&mut self) -> Result> { + let Some(max_completed_block_ptr) = self.max_completed_block_ptr()? else { + return Ok(None); + }; + + let mut ordered_completed_groups = BTreeMap::new(); + + for (stream_index, agg) in self.aggregators.iter_mut().enumerate() { + let Some(completed_groups) = agg.completed_groups(&max_completed_block_ptr)? else { + continue; + }; + + for (block_ptr, record_batch) in completed_groups { + match ordered_completed_groups.entry(block_ptr) { + Entry::Vacant(entry) => { + entry.insert(RecordBatchGroup { + record_batches: vec![StreamRecordBatch { + stream_index, + record_batch, + }], + }); + } + Entry::Occupied(mut entry) => { + entry.get_mut().record_batches.push(StreamRecordBatch { + stream_index, + record_batch, + }); + } + } + } + } + + assert!(!ordered_completed_groups.is_empty()); + Ok(Some(ordered_completed_groups)) + } + + /// Marks the aggregator for the `stream_index` as finalized. + /// + /// A finalized aggregator cannot be extended. + /// + /// # Panics + /// + /// Panics if the `stream_index` is greater than the initialized number of streams. + pub(in super::super) fn finalize(&mut self, stream_index: usize) { + assert!(stream_index < self.num_streams); + self.aggregators[stream_index].finalize(); + } + + /// Returns `true` if the aggregator for `stream_index` is finalized. + /// + /// # Panics + /// + /// Panics if the `stream_index` is greater than the initialized number of streams. + pub(in super::super) fn is_finalized(&self, stream_index: usize) -> bool { + assert!(stream_index < self.num_streams); + self.aggregators[stream_index].is_finalized() + } + + /// Returns `true` if all aggregators are finalized. + pub(in super::super) fn all_finalized(&self) -> bool { + self.aggregators.iter().all(|agg| agg.is_finalized()) + } + + /// Returns `true` if the aggregator for `stream_index` can be extended. + /// + /// # Panics + /// + /// Panics if the `stream_index` is greater than the initialized number of streams. 
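// Editor's note: a simplified sketch of what `Buffer::completed_groups` above produces --
// per-stream results are merged into one ordered map keyed by block, with each entry
// remembering which stream contributed which record batch. Plain `u64` keys and `String`
// payloads stand in for the crate's block pointers and record batches.
use std::collections::BTreeMap;

fn merge_streams(per_stream: Vec<BTreeMap<u64, String>>) -> BTreeMap<u64, Vec<(usize, String)>> {
    let mut merged: BTreeMap<u64, Vec<(usize, String)>> = BTreeMap::new();

    for (stream_index, groups) in per_stream.into_iter().enumerate() {
        for (block, record_batch) in groups {
            // Every stream that produced data for this block adds its batch to the group.
            merged.entry(block).or_default().push((stream_index, record_batch));
        }
    }

    merged
}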
+ pub(in super::super) fn has_capacity(&self, stream_index: usize) -> bool { + assert!(stream_index < self.num_streams); + self.aggregators[stream_index].len() < self.max_buffer_size + } + + /// Returns `true` if the stream `stream_index` is not allowed to make progress and + /// its aggregator does not contain any completed groups. + /// + /// # Panics + /// + /// Panics if the `stream_index` is greater than the initialized number of streams. + pub(in super::super) fn is_blocked(&self, stream_index: usize) -> bool { + !self.has_capacity(stream_index) + && !self.is_finalized(stream_index) + && !self.aggregators[stream_index].has_completed_groups() + } + + /// Returns the number of record batches stream `stream_index` has buffered. + /// + /// # Panics + /// + /// Panics if the `stream_index` is greater than the initialized number of streams. + pub(in super::super) fn size(&self, stream_index: usize) -> usize { + assert!(stream_index < self.num_streams); + self.aggregators[stream_index].len() + } + + /// Returns the block number and hash pair for the most recent completed group across all streams. + /// + /// Finds the highest block number that all streams have completed. This ensures + /// slower streams can still produce valid completed groups without skipping any groups. + /// The function returns the minimum of all maximum completed blocks to maintain consistency. + /// + /// # Errors + /// + /// Returns an error if multiple streams return the same block number but different hashes. + /// + /// The returned error is deterministic. + /// + /// # Note + /// + /// Potential reorgs are not handled at this level and are treated as data corruption. + fn max_completed_block_ptr(&self) -> Result> { + let mut max_completed_block_ptrs: BTreeMap<&BlockNumber, &BlockHash> = BTreeMap::new(); + + for (stream_index, agg) in self.aggregators.iter().enumerate() { + let Some((max_completed_block_number, max_completed_block_hash)) = + agg.max_completed_block_ptr() + else { + if !agg.is_finalized() { + return Ok(None); + } + + continue; + }; + + match max_completed_block_ptrs.entry(max_completed_block_number) { + Entry::Vacant(entry) => { + entry.insert(max_completed_block_hash); + } + Entry::Occupied(entry) => { + if *entry.get() != max_completed_block_hash { + bail!("aggregated data is corrupted: stream {} produced block hash '0x{}' for block {}, but a previous stream set the block hash to '0x{}'", + stream_index, + hex::encode(max_completed_block_hash), + max_completed_block_number, + hex::encode(entry.get()), + ); + } + } + }; + } + + Ok(max_completed_block_ptrs + .into_iter() + .next() + .map(|(block_number, block_hash)| (*block_number, *block_hash))) + } +} diff --git a/graph/src/amp/stream_aggregator/record_batch/decoder.rs b/graph/src/amp/stream_aggregator/record_batch/decoder.rs new file mode 100644 index 00000000000..a2c5cf92daf --- /dev/null +++ b/graph/src/amp/stream_aggregator/record_batch/decoder.rs @@ -0,0 +1,62 @@ +use alloy::primitives::{BlockHash, BlockNumber}; +use anyhow::{anyhow, Result}; +use arrow::array::RecordBatch; + +use crate::amp::codec::{ + self, + utils::{auto_block_hash_decoder, auto_block_number_decoder}, +}; + +/// Decodes the data required for stream aggregation. +pub(super) struct Decoder<'a> { + /// Block numbers serve as group keys for related record batches. + block_number: Box> + 'a>, + + /// Block hashes ensure data consistency across tables and datasets. + block_hash: Box> + 'a>, +} + +impl<'a> Decoder<'a> { + /// Constructs a new decoder for `record_batch`. 
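// Editor's note: a simplified sketch of the "minimum of the per-stream maximums" rule that
// `Buffer::max_completed_block_ptr` above implements (block hashes and the finalized-stream
// special case are omitted here). Emission may only advance to the highest block that every
// stream has fully completed, so no stream ever has blocks skipped.
fn emit_up_to(per_stream_max_completed: &[Option<u64>]) -> Option<u64> {
    let mut min_of_max: Option<u64> = None;

    for max_completed in per_stream_max_completed {
        // A stream that has not completed any block yet blocks emission entirely.
        let max_completed = (*max_completed)?;
        min_of_max = Some(min_of_max.map_or(max_completed, |current| current.min(max_completed)));
    }

    min_of_max
}
// For example: emit_up_to(&[Some(12), Some(10), Some(11)]) == Some(10).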
+ /// + /// # Errors + /// + /// Returns an error if: + /// - `record_batch` does not contain valid block number or hash columns + /// + /// The returned error is deterministic. + pub(super) fn new(record_batch: &'a RecordBatch) -> Result { + Ok(Self { + block_number: auto_block_number_decoder(record_batch)?.1, + block_hash: auto_block_hash_decoder(record_batch)?.1, + }) + } + + /// Returns the block number at `row_index`. + /// + /// # Errors + /// + /// Returns an error if: + /// - The block number at `row_index` is null + /// + /// The returned error is deterministic. + pub(super) fn block_number(&self, row_index: usize) -> Result { + self.block_number + .decode(row_index)? + .ok_or_else(|| anyhow!("block number is empty")) + } + + /// Returns the block hash at `row_index`. + /// + /// # Errors + /// + /// Returns an error if: + /// - The block hash at `row_index` is null or invalid + /// + /// The returned error is deterministic. + pub(super) fn block_hash(&self, row_index: usize) -> Result { + self.block_hash + .decode(row_index)? + .ok_or_else(|| anyhow!("block hash is empty")) + } +} diff --git a/graph/src/amp/stream_aggregator/record_batch/group_data.rs b/graph/src/amp/stream_aggregator/record_batch/group_data.rs new file mode 100644 index 00000000000..32d3317c585 --- /dev/null +++ b/graph/src/amp/stream_aggregator/record_batch/group_data.rs @@ -0,0 +1,88 @@ +use std::sync::Arc; + +use anyhow::{Context, Result}; +use arrow::{ + array::{RecordBatch, UInt64Array}, + compute::{concat_batches, take_record_batch}, +}; + +/// Contains references to all record batches and rows of a group. +pub(super) struct GroupData { + parts: Vec, +} + +struct Part { + record_batch: Arc, + row_indices: Vec, +} + +impl GroupData { + /// Creates a new group with an initial `record_batch` and `row_index`. + pub(super) fn new(record_batch: Arc, row_index: usize) -> Self { + Self { + parts: vec![Part { + record_batch, + row_indices: vec![row_index as u64], + }], + } + } + + /// Adds a new `record_batch` and `row_index` to this group. + pub(super) fn add(&mut self, record_batch: Arc, row_index: usize) { + self.parts.push(Part { + record_batch, + row_indices: vec![row_index as u64], + }) + } + + /// Adds a `row_index` to the most recent record batch in this group. + /// + /// # Panics + /// + /// Panics if this group is empty. + pub(super) fn add_row_index(&mut self, row_index: usize) { + assert!(!self.parts.is_empty()); + + self.parts + .last_mut() + .unwrap() + .row_indices + .push(row_index as u64); + } + + /// Converts this group into a single record batch. + /// + /// Merges all group rows from all record batches together. + /// + /// # Errors + /// + /// Returns an error if the record batches in this group have incompatible types. + /// + /// The returned error is deterministic. 
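// Editor's note: a self-contained sketch of the two arrow-rs operations that the conversion
// implemented just below (`GroupData::into_record_batch`) relies on: `take_record_batch`
// selects rows by index and `concat_batches` stitches the per-batch selections back into a
// single batch. The column name and values are invented for illustration.
use std::sync::Arc;

use arrow::array::{ArrayRef, Int64Array, RecordBatch, UInt64Array};
use arrow::compute::{concat_batches, take_record_batch};
use arrow::datatypes::{DataType, Field, Schema};
use arrow::error::ArrowError;

fn select_and_merge() -> Result<RecordBatch, ArrowError> {
    let schema = Arc::new(Schema::new(vec![Field::new("value", DataType::Int64, false)]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int64Array::from(vec![10, 20, 30, 40])) as ArrayRef],
    )?;

    // Keep rows 1 and 3 only -- the row indices this group recorded for the batch.
    let rows = UInt64Array::from(vec![1u64, 3]);
    let selected = take_record_batch(&batch, &rows)?;

    // Merging the per-batch selections (just one here) yields the group's final record batch.
    concat_batches(&schema, &[selected])
}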
+ /// + /// # Panics + /// + /// Panics if: + /// - This group is empty + /// - This group contains invalid row indices + pub(super) fn into_record_batch(self) -> Result { + assert!(!self.parts.is_empty()); + + let schema = self.parts[0].record_batch.schema(); + let mut partial_record_batches = Vec::with_capacity(self.parts.len()); + + for part in self.parts { + let Part { + record_batch, + row_indices, + } = part; + + let row_indices = UInt64Array::from(row_indices); + let partial_record_batch = take_record_batch(&record_batch, &row_indices).unwrap(); + + partial_record_batches.push(partial_record_batch); + } + + concat_batches(&schema, &partial_record_batches).context("failed to merge record batches") + } +} diff --git a/graph/src/amp/stream_aggregator/record_batch/mod.rs b/graph/src/amp/stream_aggregator/record_batch/mod.rs new file mode 100644 index 00000000000..171f360f5fa --- /dev/null +++ b/graph/src/amp/stream_aggregator/record_batch/mod.rs @@ -0,0 +1,38 @@ +//! This module handles grouping record batches from multiple streams. +//! +//! # Safety +//! +//! The implementation occasionally uses `assert` and `unwrap` to ensure consistency +//! between related types and methods. +//! +//! This is safe because the functionality is internal and not exposed to other modules. +//! +//! A panic indicates a critical error in the grouping algorithm. + +mod aggregator; +mod buffer; +mod decoder; +mod group_data; + +use std::collections::BTreeMap; + +use alloy::primitives::{BlockHash, BlockNumber}; +use arrow::array::RecordBatch; + +use self::{aggregator::Aggregator, decoder::Decoder, group_data::GroupData}; + +pub(super) use buffer::Buffer; + +/// Maps block number and hash pairs to record batches. +pub type RecordBatchGroups = BTreeMap<(BlockNumber, BlockHash), RecordBatchGroup>; + +/// Contains record batches associated with a specific block number and hash pair. +pub struct RecordBatchGroup { + pub record_batches: Vec, +} + +/// Contains a record batch and the index of its source stream. +pub struct StreamRecordBatch { + pub stream_index: usize, + pub record_batch: RecordBatch, +} diff --git a/graph/src/blockchain/mod.rs b/graph/src/blockchain/mod.rs index 7768ea7f6e9..1346213b879 100644 --- a/graph/src/blockchain/mod.rs +++ b/graph/src/blockchain/mod.rs @@ -595,6 +595,7 @@ impl FromStr for BlockchainKind { "near" => Ok(BlockchainKind::Near), "substreams" => Ok(BlockchainKind::Substreams), "subgraph" => Ok(BlockchainKind::Ethereum), // TODO(krishna): We should detect the blockchain kind from the source subgraph + "amp" => Ok(BlockchainKind::Ethereum), // TODO: Maybe get this from the Amp server _ => Err(anyhow!("unknown blockchain kind {}", s)), } } diff --git a/graph/src/blockchain/types.rs b/graph/src/blockchain/types.rs index 081fff4eea5..c64da4f4f7a 100644 --- a/graph/src/blockchain/types.rs +++ b/graph/src/blockchain/types.rs @@ -611,6 +611,12 @@ impl From for BlockTime { } } +impl From for BlockTime { + fn from(value: Timestamp) -> Self { + Self(value) + } +} + impl From for Value { fn from(block_time: BlockTime) -> Self { Value::Timestamp(block_time.0) diff --git a/graph/src/cheap_clone.rs b/graph/src/cheap_clone.rs index b8863d3918e..fc9c98ab7d1 100644 --- a/graph/src/cheap_clone.rs +++ b/graph/src/cheap_clone.rs @@ -106,6 +106,7 @@ cheap_clone_is_clone!(Channel); // reqwest::Client uses Arc internally, so it is CheapClone. 
cheap_clone_is_clone!(reqwest::Client); cheap_clone_is_clone!(slog::Logger); +cheap_clone_is_clone!(semver::Version); cheap_clone_is_copy!( (), @@ -119,3 +120,5 @@ cheap_clone_is_copy!( std::time::Duration ); cheap_clone_is_copy!(ethabi::Address); + +cheap_clone_is_clone!(tokio_util::sync::CancellationToken); diff --git a/graph/src/components/metrics/registry.rs b/graph/src/components/metrics/registry.rs index 93cf51b3bd1..b41f27bc785 100644 --- a/graph/src/components/metrics/registry.rs +++ b/graph/src/components/metrics/registry.rs @@ -1,8 +1,8 @@ use std::collections::HashMap; use std::sync::{Arc, RwLock}; -use prometheus::IntGauge; use prometheus::{labels, Histogram, IntCounterVec}; +use prometheus::{IntCounter, IntGauge}; use slog::debug; use crate::components::metrics::{counter_with_labels, gauge_with_labels}; @@ -349,6 +349,23 @@ impl MetricsRegistry { Ok(counter) } + pub fn new_int_counter( + &self, + name: impl AsRef, + help: impl AsRef, + const_labels: impl IntoIterator, + ) -> Result { + let opts = Opts::new(name.as_ref(), help.as_ref()).const_labels( + const_labels + .into_iter() + .map(|(key, value)| (key.to_string(), value.to_string())) + .collect(), + ); + let int_counter = IntCounter::with_opts(opts)?; + self.register(name.as_ref(), Box::new(int_counter.clone())); + Ok(int_counter) + } + pub fn new_counter_with_labels( &self, name: &str, @@ -500,12 +517,12 @@ impl MetricsRegistry { &self, name: impl AsRef, help: impl AsRef, - const_labels: impl IntoIterator, impl Into)>, + const_labels: impl IntoIterator, ) -> Result { let opts = Opts::new(name.as_ref(), help.as_ref()).const_labels( const_labels .into_iter() - .map(|(a, b)| (a.into(), b.into())) + .map(|(key, value)| (key.to_string(), value.to_string())) .collect(), ); let gauge = IntGauge::with_opts(opts)?; diff --git a/graph/src/components/store/err.rs b/graph/src/components/store/err.rs index 446b73408f1..d59a835d57b 100644 --- a/graph/src/components/store/err.rs +++ b/graph/src/components/store/err.rs @@ -247,3 +247,9 @@ impl From for StoreError { StoreError::Unknown(anyhow!("{}", e.to_string())) } } + +impl crate::amp::error::IsDeterministic for StoreError { + fn is_deterministic(&self) -> bool { + StoreError::is_deterministic(self) + } +} diff --git a/graph/src/components/subgraph/instance_manager.rs b/graph/src/components/subgraph/instance_manager.rs index c9f076a2a36..d014366ead2 100644 --- a/graph/src/components/subgraph/instance_manager.rs +++ b/graph/src/components/subgraph/instance_manager.rs @@ -1,4 +1,3 @@ -use crate::prelude::BlockNumber; use std::sync::Arc; use crate::components::store::DeploymentLocator; @@ -10,10 +9,7 @@ use crate::components::store::DeploymentLocator; /// subgraph instance manager stops and removes the corresponding instance. 
#[async_trait::async_trait] pub trait SubgraphInstanceManager: Send + Sync + 'static { - async fn start_subgraph( - self: Arc, - deployment: DeploymentLocator, - stop_block: Option, - ); - async fn stop_subgraph(&self, deployment: DeploymentLocator); + async fn start_subgraph(self: Arc, loc: DeploymentLocator, stop_block: Option); + + async fn stop_subgraph(&self, loc: DeploymentLocator); } diff --git a/graph/src/components/subgraph/mod.rs b/graph/src/components/subgraph/mod.rs index 5bdea73ca45..02b6486b953 100644 --- a/graph/src/components/subgraph/mod.rs +++ b/graph/src/components/subgraph/mod.rs @@ -2,7 +2,6 @@ mod host; mod instance; mod instance_manager; mod proof_of_indexing; -mod provider; mod registrar; mod settings; @@ -15,6 +14,5 @@ pub use self::proof_of_indexing::{ PoICausalityRegion, ProofOfIndexing, ProofOfIndexingEvent, ProofOfIndexingFinisher, ProofOfIndexingVersion, SharedProofOfIndexing, }; -pub use self::provider::SubgraphAssignmentProvider; pub use self::registrar::{SubgraphRegistrar, SubgraphVersionSwitchingMode}; pub use self::settings::{Setting, Settings}; diff --git a/graph/src/components/subgraph/provider.rs b/graph/src/components/subgraph/provider.rs deleted file mode 100644 index 3e33f6fd5bf..00000000000 --- a/graph/src/components/subgraph/provider.rs +++ /dev/null @@ -1,10 +0,0 @@ -use async_trait::async_trait; - -use crate::{components::store::DeploymentLocator, prelude::*}; - -/// Common trait for subgraph providers. -#[async_trait] -pub trait SubgraphAssignmentProvider: Send + Sync + 'static { - async fn start(&self, deployment: DeploymentLocator, stop_block: Option); - async fn stop(&self, deployment: DeploymentLocator); -} diff --git a/graph/src/data/store/scalar/bigdecimal.rs b/graph/src/data/store/scalar/bigdecimal.rs index b8b62f573fb..65738563a67 100644 --- a/graph/src/data/store/scalar/bigdecimal.rs +++ b/graph/src/data/store/scalar/bigdecimal.rs @@ -138,12 +138,24 @@ impl From for BigDecimal { } } +impl From for BigDecimal { + fn from(n: i128) -> Self { + Self::from(OldBigDecimal::new(BigInt::from(n).inner(), 0)) + } +} + impl From for BigDecimal { fn from(n: u64) -> Self { Self::from(OldBigDecimal::from(n)) } } +impl From for BigDecimal { + fn from(n: f32) -> Self { + Self::from(OldBigDecimal::from_f32(n).unwrap_or_default()) + } +} + impl From for BigDecimal { fn from(n: f64) -> Self { Self::from(OldBigDecimal::from_f64(n).unwrap_or_default()) diff --git a/graph/src/data/store/scalar/bigint.rs b/graph/src/data/store/scalar/bigint.rs index c344ec83a6d..554aac83d6b 100644 --- a/graph/src/data/store/scalar/bigint.rs +++ b/graph/src/data/store/scalar/bigint.rs @@ -224,14 +224,20 @@ impl BigInt { } } -impl From for BigInt { - fn from(i: i32) -> BigInt { +impl From for BigInt { + fn from(i: i8) -> BigInt { BigInt::unchecked_new(i.into()) } } -impl From for BigInt { - fn from(i: u64) -> BigInt { +impl From for BigInt { + fn from(i: i16) -> BigInt { + BigInt::unchecked_new(i.into()) + } +} + +impl From for BigInt { + fn from(i: i32) -> BigInt { BigInt::unchecked_new(i.into()) } } @@ -242,6 +248,36 @@ impl From for BigInt { } } +impl From for BigInt { + fn from(i: i128) -> BigInt { + BigInt::unchecked_new(i.into()) + } +} + +impl From for BigInt { + fn from(i: u8) -> BigInt { + BigInt::unchecked_new(i.into()) + } +} + +impl From for BigInt { + fn from(i: u16) -> BigInt { + BigInt::unchecked_new(i.into()) + } +} + +impl From for BigInt { + fn from(i: u32) -> BigInt { + BigInt::unchecked_new(i.into()) + } +} + +impl From for BigInt { + fn from(i: u64) -> 
BigInt { + BigInt::unchecked_new(i.into()) + } +} + impl From for BigInt { /// This implementation assumes that U64 represents an unsigned U64, /// and not a signed U64 (aka int64 in Solidity). Right now, this is diff --git a/graph/src/data/store/scalar/timestamp.rs b/graph/src/data/store/scalar/timestamp.rs index 02769d4adf8..58b2ef10cb8 100644 --- a/graph/src/data/store/scalar/timestamp.rs +++ b/graph/src/data/store/scalar/timestamp.rs @@ -90,6 +90,12 @@ impl stable_hash_legacy::StableHash for Timestamp { } } +impl From> for Timestamp { + fn from(value: DateTime) -> Self { + Self(value) + } +} + impl Display for Timestamp { fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { write!(f, "{}", self.as_microseconds_since_epoch()) diff --git a/graph/src/data/subgraph/api_version.rs b/graph/src/data/subgraph/api_version.rs index dad1469c7b4..163909adabb 100644 --- a/graph/src/data/subgraph/api_version.rs +++ b/graph/src/data/subgraph/api_version.rs @@ -66,8 +66,11 @@ pub const SPEC_VERSION_1_3_0: Version = Version::new(1, 3, 0); // Enables struct field access in declarative calls pub const SPEC_VERSION_1_4_0: Version = Version::new(1, 4, 0); +// Enables support for Amp data sources; +pub const SPEC_VERSION_1_5_0: Version = Version::new(1, 5, 0); + // The latest spec version available -pub const LATEST_VERSION: &Version = &SPEC_VERSION_1_4_0; +pub const LATEST_VERSION: &Version = &SPEC_VERSION_1_5_0; pub const MIN_SPEC_VERSION: Version = Version::new(0, 0, 2); diff --git a/graph/src/data/subgraph/mod.rs b/graph/src/data/subgraph/mod.rs index 25287a94e95..6d893be55cc 100644 --- a/graph/src/data/subgraph/mod.rs +++ b/graph/src/data/subgraph/mod.rs @@ -12,7 +12,7 @@ pub use features::{SubgraphFeature, SubgraphFeatureValidationError}; use crate::{cheap_clone::CheapClone, components::store::BLOCK_NUMBER_MAX, object}; use anyhow::{anyhow, Context, Error}; -use futures03::{future::try_join, stream::FuturesOrdered, TryStreamExt as _}; +use futures03::future::try_join_all; use itertools::Itertools; use semver::Version; use serde::{ @@ -32,7 +32,7 @@ use wasmparser; use web3::types::Address; use crate::{ - bail, + amp, bail, blockchain::{BlockPtr, Blockchain}, components::{ link_resolver::{LinkResolver, LinkResolverContext}, @@ -363,6 +363,8 @@ pub enum SubgraphManifestValidationError { FeatureValidationError(#[from] SubgraphFeatureValidationError), #[error("data source {0} is invalid: {1}")] DataSourceValidation(String, Error), + #[error("failed to validate Amp subgraph: {0:#}")] + Amp(#[source] Error), } #[derive(Error, Debug)] @@ -719,7 +721,7 @@ impl<'de> de::Deserialize<'de> for Prune { /// SubgraphManifest with IPFS links unresolved pub type UnresolvedSubgraphManifest = BaseSubgraphManifest< C, - UnresolvedSchema, + Option, UnresolvedDataSource, UnresolvedDataSourceTemplate, >; @@ -802,15 +804,24 @@ impl UnvalidatedSubgraphManifest { /// Entry point for resolving a subgraph definition. 
/// Right now the only supported links are of the form: /// `/ipfs/QmUmg7BZC1YP1ca66rRtWKxpXp77WgVHrnv263JtDuvs2k` - pub async fn resolve( + pub async fn resolve( id: DeploymentHash, raw: serde_yaml::Mapping, resolver: &Arc, + amp_client: Option>, logger: &Logger, max_spec_version: semver::Version, ) -> Result { Ok(Self( - SubgraphManifest::resolve_from_raw(id, raw, resolver, logger, max_spec_version).await?, + SubgraphManifest::resolve_from_raw( + id, + raw, + resolver, + amp_client, + logger, + max_spec_version, + ) + .await?, )) } @@ -875,6 +886,8 @@ impl UnvalidatedSubgraphManifest { &self.0.spec_version, )); + errors.append(&mut Self::validate_amp_subgraph(&self.0)); + match errors.is_empty() { true => Ok(self.0), false => Err(errors), @@ -884,20 +897,79 @@ impl UnvalidatedSubgraphManifest { pub fn spec_version(&self) -> &Version { &self.0.spec_version } + + fn validate_amp_subgraph( + manifest: &SubgraphManifest, + ) -> Vec { + use api_version::SPEC_VERSION_1_5_0; + + let BaseSubgraphManifest { + id: _, + spec_version, + features, + description: _, + repository: _, + schema: _, + data_sources, + graft, + templates, + chain: _, + indexer_hints: _, + } = manifest; + + let amp_data_sources = data_sources + .iter() + .filter_map(|data_source| match data_source { + DataSource::Amp(amp_data_source) => Some(amp_data_source), + _ => None, + }) + .collect_vec(); + + if amp_data_sources.is_empty() { + // Not an Amp subgraph + return Vec::new(); + } + + let mut errors = Vec::new(); + let err = |msg: &str| SubgraphManifestValidationError::Amp(anyhow!(msg.to_owned())); + + if data_sources.len() != amp_data_sources.len() { + errors.push(err("multiple data source kinds are not supported")); + } + + if *spec_version < SPEC_VERSION_1_5_0 { + errors.push(err("spec version is not supported")); + } + + if !features.is_empty() { + errors.push(err("manifest features are not supported")); + } + + if graft.is_some() { + errors.push(err("grafting is not supported")); + } + + if !templates.is_empty() { + errors.push(err("data source templates are not supported")); + } + + errors + } } impl SubgraphManifest { /// Entry point for resolving a subgraph definition. 
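// Editor's note: a condensed sketch of the checks performed by `validate_amp_subgraph` above.
// An Amp subgraph must use only Amp data sources, declare spec version 1.5.0 or later, and
// must not use manifest features, grafting, or data source templates. The argument types are
// simplified stand-ins for the manifest's own.
fn amp_manifest_errors(
    total_data_sources: usize,
    amp_data_sources: usize,
    spec_version: &semver::Version,
    has_features: bool,
    has_graft: bool,
    has_templates: bool,
) -> Vec<&'static str> {
    let mut errors = Vec::new();

    if amp_data_sources == 0 {
        // Not an Amp subgraph; none of the checks below apply.
        return errors;
    }
    if total_data_sources != amp_data_sources {
        errors.push("multiple data source kinds are not supported");
    }
    if *spec_version < semver::Version::new(1, 5, 0) {
        errors.push("spec version is not supported");
    }
    if has_features {
        errors.push("manifest features are not supported");
    }
    if has_graft {
        errors.push("grafting is not supported");
    }
    if has_templates {
        errors.push("data source templates are not supported");
    }

    errors
}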
- pub async fn resolve_from_raw( + pub async fn resolve_from_raw( id: DeploymentHash, raw: serde_yaml::Mapping, resolver: &Arc, + amp_client: Option>, logger: &Logger, max_spec_version: semver::Version, ) -> Result { let unresolved = UnresolvedSubgraphManifest::parse(id.cheap_clone(), raw)?; let resolved = unresolved - .resolve(&id, resolver, logger, max_spec_version) + .resolve(&id, resolver, amp_client, logger, max_spec_version) .await?; Ok(resolved) } @@ -1033,10 +1105,11 @@ impl UnresolvedSubgraphManifest { serde_yaml::from_value(raw.into()).map_err(Into::into) } - pub async fn resolve( + pub async fn resolve( self, deployment_hash: &DeploymentHash, resolver: &Arc, + amp_client: Option>, logger: &Logger, max_spec_version: semver::Version, ) -> Result, SubgraphManifestResolveError> { @@ -1046,7 +1119,7 @@ impl UnresolvedSubgraphManifest { features, description, repository, - schema, + schema: unresolved_schema, data_sources, graft, templates, @@ -1064,46 +1137,77 @@ impl UnresolvedSubgraphManifest { ).into()); } - let ds_count = data_sources.len(); - if ds_count as u64 + templates.len() as u64 > u32::MAX as u64 { + if data_sources.len() + templates.len() > u32::MAX as usize { return Err( - anyhow!("Subgraph has too many declared data sources and templates",).into(), + anyhow!("subgraph has too many declared data sources and templates").into(), ); } - let schema = schema - .resolve(&id, &spec_version, id.clone(), resolver, logger) - .await?; + let data_sources = try_join_all(data_sources.into_iter().enumerate().map(|(idx, ds)| { + ds.resolve( + deployment_hash, + resolver, + amp_client.cheap_clone(), + logger, + idx as u32, + &spec_version, + ) + })) + .await?; - let (data_sources, templates) = try_join( - data_sources - .into_iter() - .enumerate() - .map(|(idx, ds)| { - ds.resolve(deployment_hash, resolver, logger, idx as u32, &spec_version) - }) - .collect::>() - .try_collect::>(), - templates - .into_iter() - .enumerate() - .map(|(idx, template)| { - template.resolve( + let amp_data_sources = data_sources + .iter() + .filter_map(|data_source| match data_source { + DataSource::Amp(amp_data_source) => Some(amp_data_source), + _ => None, + }) + .collect_vec(); + + let schema = match unresolved_schema { + Some(unresolved_schema) => { + unresolved_schema + .resolve( deployment_hash, + &spec_version, + id.cheap_clone(), resolver, - &schema, logger, - ds_count as u32 + idx as u32, - &spec_version, ) - }) - .collect::>() - .try_collect::>(), - ) + .await? + } + None if amp_data_sources.len() == data_sources.len() => { + let table_schemas = amp_data_sources + .iter() + .map(|data_source| { + data_source + .transformer + .tables + .iter() + .map(|table| (table.name.clone(), table.schema.clone())) + }) + .flatten(); + + amp::schema::generate_subgraph_schema(&id, table_schemas)? + } + None => { + return Err(anyhow!("subgraph schema is required").into()); + } + }; + + let templates = try_join_all(templates.into_iter().enumerate().map(|(idx, template)| { + template.resolve( + &id, + resolver, + &schema, + logger, + data_sources.len() as u32 + idx as u32, + &spec_version, + ) + })) .await?; let is_substreams = data_sources.iter().any(|ds| ds.kind() == SUBSTREAMS_KIND); - if is_substreams && ds_count > 1 { + if is_substreams && data_sources.len() > 1 { return Err(anyhow!( "A Substreams-based subgraph can only contain a single data source." 
) @@ -1174,7 +1278,7 @@ impl UnresolvedSubgraphManifest { ); } - Ok(SubgraphManifest { + let manifest = SubgraphManifest { id, spec_version, features, @@ -1186,7 +1290,16 @@ impl UnresolvedSubgraphManifest { templates, chain, indexer_hints, - }) + }; + + if let Some(e) = UnvalidatedSubgraphManifest::::validate_amp_subgraph(&manifest) + .into_iter() + .next() + { + return Err(anyhow::Error::from(e).into()); + } + + Ok(manifest) } } diff --git a/graph/src/data_source/mod.rs b/graph/src/data_source/mod.rs index e7fc22228ea..e1598e2f0df 100644 --- a/graph/src/data_source/mod.rs +++ b/graph/src/data_source/mod.rs @@ -25,7 +25,7 @@ use crate::{ prelude::{CheapClone as _, DataSourceContext}, schema::{EntityType, InputSchema}, }; -use anyhow::Error; +use anyhow::{anyhow, Context, Error}; use semver::Version; use serde::{de::IntoDeserializer as _, Deserialize, Deserializer}; use slog::{Logger, SendSyncRefUnwindSafeKV}; @@ -36,11 +36,14 @@ use std::{ }; use thiserror::Error; +use crate::amp; + #[derive(Debug)] pub enum DataSource { Onchain(C::DataSource), Offchain(offchain::DataSource), Subgraph(subgraph::DataSource), + Amp(amp::manifest::DataSource), } #[derive(Error, Debug)] @@ -96,6 +99,7 @@ impl DataSource { Self::Onchain(ds) => Some(ds), Self::Offchain(_) => None, Self::Subgraph(_) => None, + Self::Amp(_) => None, } } @@ -104,6 +108,7 @@ impl DataSource { Self::Onchain(_) => None, Self::Offchain(_) => None, Self::Subgraph(ds) => Some(ds), + Self::Amp(_) => None, } } @@ -112,6 +117,7 @@ impl DataSource { Self::Onchain(_) => true, Self::Offchain(_) => false, Self::Subgraph(_) => true, + Self::Amp(_) => true, } } @@ -120,6 +126,7 @@ impl DataSource { Self::Onchain(_) => None, Self::Offchain(ds) => Some(ds), Self::Subgraph(_) => None, + Self::Amp(_) => None, } } @@ -128,6 +135,7 @@ impl DataSource { DataSourceEnum::Onchain(ds) => ds.network(), DataSourceEnum::Offchain(_) => None, DataSourceEnum::Subgraph(ds) => ds.network(), + Self::Amp(ds) => Some(&ds.network), } } @@ -136,6 +144,7 @@ impl DataSource { DataSourceEnum::Onchain(ds) => Some(ds.start_block()), DataSourceEnum::Offchain(_) => None, DataSourceEnum::Subgraph(ds) => Some(ds.source.start_block), + Self::Amp(ds) => Some(ds.source.start_block as i32), } } @@ -152,6 +161,7 @@ impl DataSource { Self::Onchain(ds) => ds.address().map(ToOwned::to_owned), Self::Offchain(ds) => ds.address(), Self::Subgraph(ds) => ds.address(), + Self::Amp(ds) => Some(ds.source.address.to_vec()), } } @@ -160,6 +170,7 @@ impl DataSource { Self::Onchain(ds) => ds.name(), Self::Offchain(ds) => &ds.name, Self::Subgraph(ds) => &ds.name, + Self::Amp(ds) => ds.name.as_str(), } } @@ -168,6 +179,7 @@ impl DataSource { Self::Onchain(ds) => ds.kind().to_owned(), Self::Offchain(ds) => ds.kind.to_string(), Self::Subgraph(ds) => ds.kind.clone(), + Self::Amp(_) => amp::manifest::DataSource::KIND.to_string(), } } @@ -176,6 +188,7 @@ impl DataSource { Self::Onchain(ds) => ds.min_spec_version(), Self::Offchain(ds) => ds.min_spec_version(), Self::Subgraph(ds) => ds.min_spec_version(), + Self::Amp(_) => amp::manifest::DataSource::MIN_SPEC_VERSION, } } @@ -184,6 +197,7 @@ impl DataSource { Self::Onchain(ds) => ds.end_block(), Self::Offchain(_) => None, Self::Subgraph(_) => None, + Self::Amp(ds) => Some(ds.source.end_block as i32), } } @@ -192,6 +206,7 @@ impl DataSource { Self::Onchain(ds) => ds.creation_block(), Self::Offchain(ds) => ds.creation_block, Self::Subgraph(ds) => ds.creation_block, + Self::Amp(_) => None, } } @@ -200,6 +215,7 @@ impl DataSource { Self::Onchain(ds) => 
ds.context(), Self::Offchain(ds) => ds.context.clone(), Self::Subgraph(ds) => ds.context.clone(), + Self::Amp(_) => Arc::new(None), } } @@ -208,6 +224,7 @@ impl DataSource { Self::Onchain(ds) => ds.api_version(), Self::Offchain(ds) => ds.mapping.api_version.clone(), Self::Subgraph(ds) => ds.mapping.api_version.clone(), + Self::Amp(ds) => ds.transformer.api_version.clone(), } } @@ -216,6 +233,7 @@ impl DataSource { Self::Onchain(ds) => ds.runtime(), Self::Offchain(ds) => Some(ds.mapping.runtime.cheap_clone()), Self::Subgraph(ds) => Some(ds.mapping.runtime.cheap_clone()), + Self::Amp(_) => None, } } @@ -226,6 +244,7 @@ impl DataSource { Self::Onchain(_) => EntityTypeAccess::Any, Self::Offchain(ds) => EntityTypeAccess::Restriced(ds.mapping.entities.clone()), Self::Subgraph(_) => EntityTypeAccess::Any, + Self::Amp(_) => EntityTypeAccess::Any, } } @@ -234,6 +253,7 @@ impl DataSource { Self::Onchain(ds) => ds.handler_kinds(), Self::Offchain(ds) => vec![ds.handler_kind()].into_iter().collect(), Self::Subgraph(ds) => vec![ds.handler_kind()].into_iter().collect(), + Self::Amp(_) => HashSet::new(), } } @@ -242,6 +262,7 @@ impl DataSource { Self::Onchain(ds) => ds.has_declared_calls(), Self::Offchain(_) => false, Self::Subgraph(_) => false, + Self::Amp(_) => false, } } @@ -268,6 +289,7 @@ impl DataSource { | (Self::Offchain(_), TriggerData::Subgraph(_)) | (Self::Subgraph(_), TriggerData::Onchain(_)) | (Self::Subgraph(_), TriggerData::Offchain(_)) => Ok(None), + (Self::Amp(_), _) => Ok(None), } } @@ -284,6 +306,7 @@ impl DataSource { Self::Onchain(ds) => ds.as_stored_dynamic_data_source(), Self::Offchain(ds) => ds.as_stored_dynamic_data_source(), Self::Subgraph(_) => todo!(), // TODO(krishna) + Self::Amp(_) => unreachable!(), } } @@ -309,6 +332,7 @@ impl DataSource { Self::Onchain(ds) => ds.validate(spec_version), Self::Offchain(_) => vec![], Self::Subgraph(_) => vec![], // TODO(krishna) + Self::Amp(_) => Vec::new(), } } @@ -317,6 +341,7 @@ impl DataSource { Self::Onchain(_) => CausalityRegion::ONCHAIN, Self::Offchain(ds) => ds.causality_region, Self::Subgraph(_) => CausalityRegion::ONCHAIN, + Self::Amp(_) => CausalityRegion::ONCHAIN, } } } @@ -326,13 +351,15 @@ pub enum UnresolvedDataSource { Onchain(C::UnresolvedDataSource), Offchain(offchain::UnresolvedDataSource), Subgraph(subgraph::UnresolvedDataSource), + Amp(amp::manifest::data_source::RawDataSource), } impl UnresolvedDataSource { - pub async fn resolve( + pub async fn resolve( self, deployment_hash: &DeploymentHash, resolver: &Arc, + amp_client: Option>, logger: &Logger, manifest_idx: u32, spec_version: &semver::Version, @@ -349,9 +376,10 @@ impl UnresolvedDataSource { .await .map(DataSource::Onchain), Self::Subgraph(unresolved) => unresolved - .resolve::( + .resolve::( deployment_hash, resolver, + amp_client, logger, manifest_idx, spec_version, @@ -364,7 +392,16 @@ impl UnresolvedDataSource { for details see https://github.com/graphprotocol/graph-node/issues/3864" ); } + Self::Amp(raw_data_source) => match amp_client { + Some(amp_client) => raw_data_source + .resolve(logger, resolver.as_ref(), amp_client.as_ref()) + .await + .map(DataSource::Amp) + .map_err(Error::from), + None => Err(anyhow!("support for Amp data sources is not enabled")), + }, } + .with_context(|| format!("failed to resolve data source at index {manifest_idx}")) } } @@ -624,58 +661,95 @@ impl MappingTrigger { } } -macro_rules! 
clone_data_source { - ($t:ident) => { - impl Clone for $t { - fn clone(&self) -> Self { - match self { - Self::Onchain(ds) => Self::Onchain(ds.clone()), - Self::Offchain(ds) => Self::Offchain(ds.clone()), - Self::Subgraph(ds) => Self::Subgraph(ds.clone()), - } - } +impl Clone for DataSource { + fn clone(&self) -> Self { + match self { + Self::Onchain(ds) => Self::Onchain(ds.clone()), + Self::Offchain(ds) => Self::Offchain(ds.clone()), + Self::Subgraph(ds) => Self::Subgraph(ds.clone()), + Self::Amp(ds) => Self::Amp(ds.clone()), } - }; + } } -clone_data_source!(DataSource); -clone_data_source!(DataSourceTemplate); - -macro_rules! deserialize_data_source { - ($t:ident) => { - impl<'de, C: Blockchain> Deserialize<'de> for $t { - fn deserialize(deserializer: D) -> Result - where - D: Deserializer<'de>, - { - let map: BTreeMap = BTreeMap::deserialize(deserializer)?; - let kind = map - .get("kind") - .ok_or(serde::de::Error::missing_field("kind"))? - .as_str() - .unwrap_or("?"); - if OFFCHAIN_KINDS.contains_key(&kind) { - offchain::$t::deserialize(map.into_deserializer()) - .map_err(serde::de::Error::custom) - .map($t::Offchain) - } else if SUBGRAPH_DS_KIND == kind { - subgraph::$t::deserialize(map.into_deserializer()) - .map_err(serde::de::Error::custom) - .map($t::Subgraph) - } else if (&C::KIND.to_string() == kind) || C::ALIASES.contains(&kind) { - C::$t::deserialize(map.into_deserializer()) - .map_err(serde::de::Error::custom) - .map($t::Onchain) - } else { - Err(serde::de::Error::custom(format!( - "data source has invalid `kind`; expected {}, file/ipfs", - C::KIND, - ))) - } - } +impl Clone for DataSourceTemplate { + fn clone(&self) -> Self { + match self { + Self::Onchain(ds) => Self::Onchain(ds.clone()), + Self::Offchain(ds) => Self::Offchain(ds.clone()), + Self::Subgraph(ds) => Self::Subgraph(ds.clone()), } - }; + } } -deserialize_data_source!(UnresolvedDataSource); -deserialize_data_source!(UnresolvedDataSourceTemplate); +impl<'de, C: Blockchain> Deserialize<'de> for UnresolvedDataSource { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let map: BTreeMap = BTreeMap::deserialize(deserializer)?; + + let kind = map + .get("kind") + .ok_or(serde::de::Error::missing_field("kind"))? + .as_str() + .unwrap_or("?"); + + if OFFCHAIN_KINDS.contains_key(&kind) { + offchain::UnresolvedDataSource::deserialize(map.into_deserializer()) + .map_err(serde::de::Error::custom) + .map(UnresolvedDataSource::Offchain) + } else if SUBGRAPH_DS_KIND == kind { + subgraph::UnresolvedDataSource::deserialize(map.into_deserializer()) + .map_err(serde::de::Error::custom) + .map(UnresolvedDataSource::Subgraph) + } else if amp::manifest::DataSource::KIND == kind { + amp::manifest::data_source::RawDataSource::deserialize(map.into_deserializer()) + .map(UnresolvedDataSource::Amp) + .map_err(serde::de::Error::custom) + } else if (&C::KIND.to_string() == kind) || C::ALIASES.contains(&kind) { + C::UnresolvedDataSource::deserialize(map.into_deserializer()) + .map_err(serde::de::Error::custom) + .map(UnresolvedDataSource::Onchain) + } else { + Err(serde::de::Error::custom(format!( + "data source has invalid `kind`; expected {}, file/ipfs", + C::KIND, + ))) + } + } +} + +impl<'de, C: Blockchain> Deserialize<'de> for UnresolvedDataSourceTemplate { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let map: BTreeMap = BTreeMap::deserialize(deserializer)?; + + let kind = map + .get("kind") + .ok_or(serde::de::Error::missing_field("kind"))? 
+ .as_str() + .unwrap_or("?"); + + if OFFCHAIN_KINDS.contains_key(&kind) { + offchain::UnresolvedDataSourceTemplate::deserialize(map.into_deserializer()) + .map_err(serde::de::Error::custom) + .map(UnresolvedDataSourceTemplate::Offchain) + } else if SUBGRAPH_DS_KIND == kind { + subgraph::UnresolvedDataSourceTemplate::deserialize(map.into_deserializer()) + .map_err(serde::de::Error::custom) + .map(UnresolvedDataSourceTemplate::Subgraph) + } else if (&C::KIND.to_string() == kind) || C::ALIASES.contains(&kind) { + C::UnresolvedDataSourceTemplate::deserialize(map.into_deserializer()) + .map_err(serde::de::Error::custom) + .map(UnresolvedDataSourceTemplate::Onchain) + } else { + Err(serde::de::Error::custom(format!( + "data source has invalid `kind`; expected {}, file/ipfs", + C::KIND, + ))) + } + } +} diff --git a/graph/src/data_source/subgraph.rs b/graph/src/data_source/subgraph.rs index 9f20260c6de..c9f01cf4890 100644 --- a/graph/src/data_source/subgraph.rs +++ b/graph/src/data_source/subgraph.rs @@ -28,6 +28,7 @@ use super::{ }, DataSourceTemplateInfo, TriggerWithHandler, }; +use crate::amp; pub const SUBGRAPH_DS_KIND: &str = "subgraph"; @@ -282,10 +283,11 @@ impl UnresolvedDataSource { Ok(()) } - async fn resolve_source_manifest( + async fn resolve_source_manifest( &self, deployment_hash: &DeploymentHash, resolver: &Arc, + amp_client: Option>, logger: &Logger, ) -> Result>, Error> { let resolver: Arc = @@ -319,7 +321,13 @@ impl UnresolvedDataSource { let resolver: Arc = Arc::from(resolver.for_manifest(&self.source.address.to_string())?); source_manifest - .resolve(&deployment_hash, &resolver, logger, LATEST_VERSION.clone()) + .resolve( + &deployment_hash, + &resolver, + amp_client, + logger, + LATEST_VERSION.clone(), + ) .await .context(format!( "Failed to resolve source subgraph [{}] manifest", @@ -329,9 +337,10 @@ impl UnresolvedDataSource { } /// Recursively verifies that all grafts in the chain meet the minimum spec version requirement for a subgraph source - async fn verify_graft_chain_sourcable( + async fn verify_graft_chain_sourcable( manifest: Arc>, resolver: &Arc, + amp_client: Option>, logger: &Logger, graft_chain: &mut Vec, ) -> Result<(), Error> { @@ -364,13 +373,20 @@ impl UnresolvedDataSource { let graft_manifest = UnresolvedSubgraphManifest::::parse(graft.base.clone(), graft_raw) .context("Failed to parse graft base manifest")? 
- .resolve(&manifest.id, resolver, logger, LATEST_VERSION.clone()) + .resolve( + &manifest.id, + resolver, + amp_client.cheap_clone(), + logger, + LATEST_VERSION.clone(), + ) .await .context("Failed to resolve graft base manifest")?; Box::pin(Self::verify_graft_chain_sourcable( Arc::new(graft_manifest), resolver, + amp_client, logger, graft_chain, )) @@ -380,10 +396,12 @@ impl UnresolvedDataSource { Ok(()) } - pub(super) async fn resolve( + #[allow(dead_code)] + pub(super) async fn resolve( self, deployment_hash: &DeploymentHash, resolver: &Arc, + amp_client: Option>, logger: &Logger, manifest_idx: u32, spec_version: &semver::Version, @@ -396,7 +414,12 @@ impl UnresolvedDataSource { let kind = self.kind.clone(); let source_manifest = self - .resolve_source_manifest::(deployment_hash, resolver, logger) + .resolve_source_manifest::( + deployment_hash, + resolver, + amp_client.cheap_clone(), + logger, + ) .await?; let source_spec_version = &source_manifest.spec_version; if source_spec_version < &SPEC_VERSION_1_3_0 { @@ -413,6 +436,7 @@ impl UnresolvedDataSource { Self::verify_graft_chain_sourcable( source_manifest.clone(), resolver, + amp_client, logger, &mut graft_chain, ) diff --git a/graph/src/env/amp.rs b/graph/src/env/amp.rs new file mode 100644 index 00000000000..ef4fff7c1dc --- /dev/null +++ b/graph/src/env/amp.rs @@ -0,0 +1,76 @@ +use std::time::Duration; + +/// Contains environment variables related to Amp subgraphs. +#[derive(Debug)] +pub struct AmpEnv { + /// Maximum number of record batches to buffer in memory per stream for each SQL query. + /// This is the maximum number of record batches that can be output by a single block. + /// + /// Defaults to `1,000`. + pub max_buffer_size: usize, + + /// Maximum number of blocks to request per stream for each SQL query. + /// Limiting this value reduces load on the Amp server when processing heavy queries. + /// + /// Defaults to `2,000,000`. + pub max_block_range: usize, + + /// Minimum time to wait before retrying a failed SQL query to the Amp server. + /// + /// Defaults to `1` second. + pub query_retry_min_delay: Duration, + + /// Maximum time to wait before retrying a failed SQL query to the Amp server. + /// + /// Defaults to `600` seconds. + pub query_retry_max_delay: Duration, + + /// Token used to authenticate Amp Flight gRPC service requests. + /// + /// Defaults to `None`. 
+ pub flight_service_token: Option, +} + +impl AmpEnv { + const DEFAULT_MAX_BUFFER_SIZE: usize = 1_000; + const DEFAULT_MAX_BLOCK_RANGE: usize = 2_000_000; + const DEFAULT_QUERY_RETRY_MIN_DELAY: Duration = Duration::from_secs(1); + const DEFAULT_QUERY_RETRY_MAX_DELAY: Duration = Duration::from_secs(600); + + pub(super) fn new(raw_env: &super::Inner) -> Self { + Self { + max_buffer_size: raw_env + .amp_max_buffer_size + .and_then(|value| { + if value == 0 { + return None; + } + Some(value) + }) + .unwrap_or(Self::DEFAULT_MAX_BUFFER_SIZE), + max_block_range: raw_env + .amp_max_block_range + .and_then(|mut value| { + if value == 0 { + value = usize::MAX; + } + Some(value) + }) + .unwrap_or(Self::DEFAULT_MAX_BLOCK_RANGE), + query_retry_min_delay: raw_env + .amp_query_retry_min_delay_seconds + .map(Duration::from_secs) + .unwrap_or(Self::DEFAULT_QUERY_RETRY_MIN_DELAY), + query_retry_max_delay: raw_env + .amp_query_retry_max_delay_seconds + .map(Duration::from_secs) + .unwrap_or(Self::DEFAULT_QUERY_RETRY_MAX_DELAY), + flight_service_token: raw_env.amp_flight_service_token.as_ref().and_then(|value| { + if value.is_empty() { + return None; + } + Some(value.to_string()) + }), + } + } +} diff --git a/graph/src/env/mod.rs b/graph/src/env/mod.rs index 3fce087986e..09657c041f5 100644 --- a/graph/src/env/mod.rs +++ b/graph/src/env/mod.rs @@ -1,11 +1,13 @@ +mod amp; mod graphql; mod mappings; mod store; +use std::{collections::HashSet, env::VarError, fmt, str::FromStr, sync::Arc, time::Duration}; + use envconfig::Envconfig; use lazy_static::lazy_static; use semver::Version; -use std::{collections::HashSet, env::VarError, fmt, str::FromStr, time::Duration}; use self::graphql::*; use self::mappings::*; @@ -15,6 +17,8 @@ use crate::{ runtime::gas::CONST_MAX_GAS_PER_HANDLER, }; +pub use self::amp::AmpEnv; + #[cfg(debug_assertions)] use std::sync::Mutex; @@ -50,6 +54,7 @@ pub struct EnvVars { pub graphql: EnvVarsGraphQl, pub mappings: EnvVarsMapping, pub store: EnvVarsStore, + pub amp: Arc, /// Enables query throttling when getting database connections goes over this value. /// Load management can be disabled by setting this to 0. 
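// Editor's note: a sketch of the zero-value handling in `AmpEnv::new` above, which is easy
// to miss: GRAPH_AMP_MAX_BUFFER_SIZE=0 falls back to the default (1,000), while
// GRAPH_AMP_MAX_BLOCK_RANGE=0 means "unlimited" (usize::MAX). The variable names match the
// `Inner` struct further down in env/mod.rs; the helper function itself is illustrative only.
fn effective_limits(
    raw_max_buffer_size: Option<usize>,
    raw_max_block_range: Option<usize>,
) -> (usize, usize) {
    let max_buffer_size = match raw_max_buffer_size {
        Some(0) | None => 1_000, // zero and unset both mean "use the default"
        Some(value) => value,
    };

    let max_block_range = match raw_max_block_range {
        None => 2_000_000,        // unset means "use the default"
        Some(0) => usize::MAX,    // zero disables the limit
        Some(value) => value,
    };

    (max_buffer_size, max_block_range)
}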
@@ -296,6 +301,7 @@ impl EnvVars { graphql, mappings: mapping_handlers, store, + amp: Arc::new(AmpEnv::new(&inner)), load_threshold: Duration::from_millis(inner.load_threshold_in_ms), load_jail_threshold: inner.load_jail_threshold, @@ -469,7 +475,7 @@ struct Inner { default = "false" )] allow_non_deterministic_fulltext_search: EnvVarBoolean, - #[envconfig(from = "GRAPH_MAX_SPEC_VERSION", default = "1.4.0")] + #[envconfig(from = "GRAPH_MAX_SPEC_VERSION", default = "1.5.0")] max_spec_version: Version, #[envconfig(from = "GRAPH_LOAD_WINDOW_SIZE", default = "300")] load_window_size_in_secs: u64, @@ -587,6 +593,17 @@ struct Inner { default = "false" )] disable_deployment_hash_validation: EnvVarBoolean, + + #[envconfig(from = "GRAPH_AMP_MAX_BUFFER_SIZE")] + amp_max_buffer_size: Option, + #[envconfig(from = "GRAPH_AMP_MAX_BLOCK_RANGE")] + amp_max_block_range: Option, + #[envconfig(from = "GRAPH_AMP_QUERY_RETRY_MIN_DELAY_SECONDS")] + amp_query_retry_min_delay_seconds: Option, + #[envconfig(from = "GRAPH_AMP_QUERY_RETRY_MAX_DELAY_SECONDS")] + amp_query_retry_max_delay_seconds: Option, + #[envconfig(from = "GRAPH_AMP_FLIGHT_SERVICE_TOKEN")] + amp_flight_service_token: Option, } #[derive(Clone, Debug)] diff --git a/graph/src/lib.rs b/graph/src/lib.rs index 05407603f48..cdc50d0f4e2 100644 --- a/graph/src/lib.rs +++ b/graph/src/lib.rs @@ -37,6 +37,8 @@ pub mod env; pub mod ipfs; +pub mod amp; + /// Wrapper for spawning tasks that abort on panic, which is our default. mod task_spawn; pub use task_spawn::{ @@ -136,8 +138,7 @@ pub mod prelude { }; pub use crate::components::subgraph::{ BlockState, HostMetrics, InstanceDSTemplateInfo, RuntimeHost, RuntimeHostBuilder, - SubgraphAssignmentProvider, SubgraphInstanceManager, SubgraphRegistrar, - SubgraphVersionSwitchingMode, + SubgraphInstanceManager, SubgraphRegistrar, SubgraphVersionSwitchingMode, }; pub use crate::components::trigger_processor::TriggerProcessor; pub use crate::components::versions::{ApiVersion, FeatureFlag}; diff --git a/node/Cargo.toml b/node/Cargo.toml index 5b7f051efe1..c94bed08b5d 100644 --- a/node/Cargo.toml +++ b/node/Cargo.toml @@ -41,3 +41,6 @@ prometheus = { version = "0.14.0", features = ["push"] } json-structural-diff = { version = "0.2", features = ["colorize"] } globset = "0.4.16" notify = "8.2.0" + +# Dependencies related to Amp subgraphs +tokio-util.workspace = true diff --git a/node/src/bin/manager.rs b/node/src/bin/manager.rs index 9e67a532a8c..a35e543a5a4 100644 --- a/node/src/bin/manager.rs +++ b/node/src/bin/manager.rs @@ -105,6 +105,15 @@ pub struct Opt { pub fork_base: Option, #[clap(long, help = "version label, used for prometheus metrics")] pub version_label: Option, + + #[clap( + long, + value_name = "{HOST:PORT|URL}", + env = "GRAPH_AMP_FLIGHT_SERVICE_ADDRESS", + help = "The address of the Amp Flight gRPC service" + )] + pub amp_flight_service_address: Option, + #[clap(subcommand)] pub cmd: Command, } @@ -1331,6 +1340,7 @@ async fn main() -> anyhow::Result<()> { network_name, ipfs_url, arweave_url, + opt.amp_flight_service_address.clone(), config, metrics_ctx, node_id, diff --git a/node/src/launcher.rs b/node/src/launcher.rs index 8855ef1a954..3bbccf5cf0e 100644 --- a/node/src/launcher.rs +++ b/node/src/launcher.rs @@ -1,27 +1,25 @@ -use anyhow::Result; +use std::{ + io::{BufRead, BufReader}, + path::Path, + time::Duration, +}; +use anyhow::Result; use git_testament::{git_testament, render_testament}; -use graph::futures03::future::TryFutureExt; - -use crate::config::Config; -use 
crate::helpers::watch_subgraph_updates;
-use crate::network_setup::Networks;
-use crate::opt::Opt;
-use crate::store_builder::StoreBuilder;
-use graph::blockchain::{Blockchain, BlockchainKind, BlockchainMap};
 use graph::components::link_resolver::{ArweaveClient, FileSizeLimit};
 use graph::components::subgraph::Settings;
 use graph::data::graphql::load_manager::LoadManager;
 use graph::endpoint::EndpointMetrics;
 use graph::env::EnvVars;
+use graph::futures03::future::TryFutureExt;
 use graph::prelude::*;
 use graph::prometheus::Registry;
 use graph::url::Url;
-use graph_core::polling_monitor::{arweave_service, ArweaveService, IpfsService};
-use graph_core::{
-    SubgraphAssignmentProvider as IpfsSubgraphAssignmentProvider, SubgraphInstanceManager,
-    SubgraphRegistrar as IpfsSubgraphRegistrar,
+use graph::{
+    amp,
+    blockchain::{Blockchain, BlockchainKind, BlockchainMap},
 };
+use graph_core::polling_monitor::{arweave_service, ArweaveService, IpfsService};
 use graph_graphql::prelude::GraphQlRunner;
 use graph_server_http::GraphQLServer as GraphQLQueryServer;
 use graph_server_index_node::IndexNodeServer;
@@ -33,10 +31,14 @@ use graph_store_postgres::{
 };
 use graphman_server::GraphmanServer;
 use graphman_server::GraphmanServerConfig;
-use std::io::{BufRead, BufReader};
-use std::path::Path;
-use std::time::Duration;
 use tokio::sync::mpsc;
+use tokio_util::sync::CancellationToken;
+
+use crate::config::Config;
+use crate::helpers::watch_subgraph_updates;
+use crate::network_setup::Networks;
+use crate::opt::Opt;
+use crate::store_builder::StoreBuilder;

 git_testament!(TESTAMENT);

@@ -256,7 +258,7 @@ fn deploy_subgraph_from_flag(
     );
 }

-fn build_subgraph_registrar(
+fn build_subgraph_registrar(
     metrics_registry: Arc,
     network_store: &Arc,
     logger_factory: &LoggerFactory,
@@ -268,17 +270,43 @@ fn build_subgraph_registrar(
     subscription_manager: Arc,
     arweave_service: ArweaveService,
     ipfs_service: IpfsService,
+    amp_client: Option>,
+    cancel_token: CancellationToken,
 ) -> Arc<
-    IpfsSubgraphRegistrar<
-        IpfsSubgraphAssignmentProvider>,
+    graph_core::subgraph::SubgraphRegistrar<
+        graph_core::subgraph_provider::SubgraphProvider,
         SubgraphStore,
         SubscriptionManager,
+        AC,
     >,
-> {
+>
+where
+    AC: amp::Client + Send + Sync + 'static,
+{
     let static_filters = ENV_VARS.experimental_static_filters;
     let sg_count = Arc::new(SubgraphCountMetric::new(metrics_registry.cheap_clone()));

-    let subgraph_instance_manager = SubgraphInstanceManager::new(
+    let mut subgraph_instance_managers =
+        graph_core::subgraph_provider::SubgraphInstanceManagers::new();
+
+    if let Some(amp_client) = amp_client.cheap_clone() {
+        let amp_instance_manager = graph_core::amp_subgraph::Manager::new(
+            &logger_factory,
+            metrics_registry.cheap_clone(),
+            env_vars.cheap_clone(),
+            &cancel_token,
+            network_store.subgraph_store(),
+            link_resolver.cheap_clone(),
+            amp_client,
+        );
+
+        subgraph_instance_managers.add(
+            graph_core::subgraph_provider::SubgraphProcessingKind::Amp,
+            Arc::new(amp_instance_manager),
+        );
+    }
+
+    let subgraph_instance_manager = graph_core::subgraph::SubgraphInstanceManager::new(
         &logger_factory,
         env_vars.cheap_clone(),
         network_store.subgraph_store(),
@@ -288,23 +316,34 @@ fn build_subgraph_registrar(
         link_resolver.clone(),
         ipfs_service,
         arweave_service,
+        amp_client.cheap_clone(),
         static_filters,
     );

-    // Create IPFS-based subgraph provider
-    let subgraph_provider =
-        IpfsSubgraphAssignmentProvider::new(&logger_factory, subgraph_instance_manager, sg_count);
+    subgraph_instance_managers.add(
+        graph_core::subgraph_provider::SubgraphProcessingKind::Trigger,
+        Arc::new(subgraph_instance_manager),
+    );
+
+    let subgraph_provider = graph_core::subgraph_provider::SubgraphProvider::new(
+        &logger_factory,
+        sg_count.cheap_clone(),
+        link_resolver.cheap_clone(),
+        tokio_util::sync::CancellationToken::new(),
+        subgraph_instance_managers,
+    );

     // Check version switching mode environment variable
     let version_switching_mode = ENV_VARS.subgraph_version_switching_mode;

     // Create named subgraph provider for resolving subgraph name->ID mappings
-    let subgraph_registrar = Arc::new(IpfsSubgraphRegistrar::new(
+    let subgraph_registrar = Arc::new(graph_core::subgraph::SubgraphRegistrar::new(
         &logger_factory,
         link_resolver,
         Arc::new(subgraph_provider),
         network_store.subgraph_store(),
         subscription_manager,
+        amp_client,
         blockchain_map,
         node_id.clone(),
         version_switching_mode,
@@ -359,6 +398,7 @@ pub async fn run(
     dev_updates: Option>,
     prometheus_registry: Arc,
     metrics_registry: Arc,
+    cancel_token: CancellationToken,
 ) {
     // Log version information
     info!(
@@ -459,6 +499,25 @@ pub async fn run(
         &logger_factory,
     );

+    let amp_client = match opt.amp_flight_service_address.as_deref() {
+        Some(amp_flight_service_address) => {
+            let addr = amp_flight_service_address
+                .parse()
+                .expect("Invalid Amp Flight service address");
+
+            let mut amp_client = amp::FlightClient::new(addr)
+                .await
+                .expect("Failed to connect to Amp Flight service");
+
+            if let Some(auth_token) = &env_vars.amp.flight_service_token {
+                amp_client.set_auth_token(auth_token);
+            }
+
+            Some(Arc::new(amp_client))
+        }
+        None => None,
+    };
+
     start_graphman_server(opt.graphman_port, graphman_server_config).await;

     let launch_services = |logger: Logger, env_vars: Arc| async move {
@@ -493,6 +552,7 @@ pub async fn run(
         blockchain_map.clone(),
         network_store.clone(),
         link_resolver.clone(),
+        amp_client.cheap_clone(),
     );

     if !opt.disable_block_ingestor {
@@ -518,6 +578,8 @@ pub async fn run(
         subscription_manager,
         arweave_service,
         ipfs_service,
+        amp_client,
+        cancel_token,
     );

     graph::spawn(
diff --git a/node/src/main.rs b/node/src/main.rs
index 795b28e05aa..8742e097a34 100644
--- a/node/src/main.rs
+++ b/node/src/main.rs
@@ -1,11 +1,9 @@
 use clap::Parser as _;
 use git_testament::git_testament;
-
-use graph::prelude::*;
-use graph::{env::EnvVars, log::logger};
-
+use graph::{env::EnvVars, log::logger, prelude::*};
 use graph_core::polling_monitor::ipfs_service;
 use graph_node::{launcher, opt};
+use tokio_util::sync::CancellationToken;

 git_testament!(TESTAMENT);

@@ -27,6 +25,8 @@ fn main() {
 async fn main_inner() {
     env_logger::init();
+
+    let cancel_token = shutdown_token();

     let env_vars = Arc::new(EnvVars::from_env().unwrap());
     let opt = opt::Opt::parse();
@@ -61,6 +61,43 @@ async fn main_inner() {
         None,
         prometheus_registry,
         metrics_registry,
+        cancel_token,
     )
     .await;
 }
+
+fn shutdown_token() -> CancellationToken {
+    use tokio::signal;
+
+    let cancel_token = CancellationToken::new();
+    let cancel_token_clone = cancel_token.clone();
+
+    async fn shutdown_signal_handler() {
+        let ctrl_c = async {
+            signal::ctrl_c().await.unwrap();
+        };
+
+        #[cfg(unix)]
+        let terminate = async {
+            signal::unix::signal(signal::unix::SignalKind::terminate())
+                .unwrap()
+                .recv()
+                .await;
+        };
+
+        #[cfg(not(unix))]
+        let terminate = std::future::pending::<()>();
+
+        tokio::select! {
+            _ = ctrl_c => {},
+            _ = terminate => {},
+        };
+    }
+
+    tokio::spawn(async move {
+        shutdown_signal_handler().await;
+        cancel_token_clone.cancel();
+    });
+
+    cancel_token
+}
diff --git a/node/src/manager/commands/run.rs b/node/src/manager/commands/run.rs
index 060341fb6e0..473a12e5d17 100644
--- a/node/src/manager/commands/run.rs
+++ b/node/src/manager/commands/run.rs
@@ -7,25 +7,23 @@ use crate::manager::PanicSubscriptionManager;
 use crate::network_setup::Networks;
 use crate::store_builder::StoreBuilder;
 use crate::MetricsContext;
+use graph::amp;
 use graph::anyhow::bail;
 use graph::cheap_clone::CheapClone;
 use graph::components::link_resolver::{ArweaveClient, FileSizeLimit};
 use graph::components::network_provider::chain_id_validator;
 use graph::components::store::DeploymentLocator;
-use graph::components::subgraph::Settings;
+use graph::components::subgraph::{Settings, SubgraphInstanceManager as _};
 use graph::endpoint::EndpointMetrics;
 use graph::env::EnvVars;
 use graph::prelude::{
     anyhow, tokio, BlockNumber, DeploymentHash, IpfsResolver, LoggerFactory, NodeId,
-    SubgraphAssignmentProvider, SubgraphCountMetric, SubgraphName, SubgraphRegistrar,
-    SubgraphStore, SubgraphVersionSwitchingMode, ENV_VARS,
+    SubgraphCountMetric, SubgraphName, SubgraphRegistrar, SubgraphStore,
+    SubgraphVersionSwitchingMode, ENV_VARS,
 };
 use graph::slog::{debug, info, Logger};
 use graph_core::polling_monitor::{arweave_service, ipfs_service};
-use graph_core::{
-    SubgraphAssignmentProvider as IpfsSubgraphAssignmentProvider, SubgraphInstanceManager,
-    SubgraphRegistrar as IpfsSubgraphRegistrar,
-};
+use tokio_util::sync::CancellationToken;

 fn locate(store: &dyn SubgraphStore, hash: &str) -> Result {
     let mut locators = store.locators(hash)?;
@@ -42,6 +40,7 @@ pub async fn run(
     _network_name: String,
     ipfs_url: Vec,
     arweave_url: String,
+    amp_flight_service_address: Option,
     config: Config,
     metrics_ctx: MetricsContext,
     node_id: NodeId,
@@ -53,6 +52,7 @@ pub async fn run(
         subgraph, stop_block
     );

+    let cancel_token = CancellationToken::new();
     let env_vars = Arc::new(EnvVars::from_env().unwrap());
     let metrics_registry = metrics_ctx.registry.clone();
     let logger_factory = LoggerFactory::new(logger.clone(), None, metrics_ctx.registry.clone());
@@ -139,10 +139,47 @@ pub async fn run(
     );

     let static_filters = ENV_VARS.experimental_static_filters;
-
     let sg_metrics = Arc::new(SubgraphCountMetric::new(metrics_registry.clone()));

-    let subgraph_instance_manager = SubgraphInstanceManager::new(
+    let mut subgraph_instance_managers =
+        graph_core::subgraph_provider::SubgraphInstanceManagers::new();
+
+    let amp_client = match amp_flight_service_address {
+        Some(amp_flight_service_address) => {
+            let addr = amp_flight_service_address
+                .parse()
+                .expect("Invalid Amp Flight service address");
+
+            let mut amp_client = amp::FlightClient::new(addr)
+                .await
+                .expect("Failed to connect to Amp Flight service");
+
+            if let Some(auth_token) = &env_vars.amp.flight_service_token {
+                amp_client.set_auth_token(auth_token);
+            }
+
+            let amp_client = Arc::new(amp_client);
+            let amp_instance_manager = graph_core::amp_subgraph::Manager::new(
+                &logger_factory,
+                metrics_registry.cheap_clone(),
+                env_vars.cheap_clone(),
+                &cancel_token,
+                network_store.subgraph_store(),
+                link_resolver.cheap_clone(),
+                amp_client.cheap_clone(),
+            );
+
+            subgraph_instance_managers.add(
+                graph_core::subgraph_provider::SubgraphProcessingKind::Amp,
+                Arc::new(amp_instance_manager),
+            );
+
+            Some(amp_client)
+        }
+        None => None,
+    };
+
+    let subgraph_instance_manager = graph_core::subgraph::SubgraphInstanceManager::new(
         &logger_factory,
         env_vars.cheap_clone(),
         subgraph_store.clone(),
@@ -152,24 +189,32 @@ pub async fn run(
         link_resolver.cheap_clone(),
         ipfs_service,
         arweave_service,
+        amp_client.cheap_clone(),
         static_filters,
     );

-    // Create IPFS-based subgraph provider
-    let subgraph_provider = Arc::new(IpfsSubgraphAssignmentProvider::new(
+    subgraph_instance_managers.add(
+        graph_core::subgraph_provider::SubgraphProcessingKind::Trigger,
+        Arc::new(subgraph_instance_manager),
+    );
+
+    let subgraph_provider = Arc::new(graph_core::subgraph_provider::SubgraphProvider::new(
         &logger_factory,
-        subgraph_instance_manager,
-        sg_metrics,
+        sg_metrics.cheap_clone(),
+        link_resolver.cheap_clone(),
+        cancel_token.clone(),
+        subgraph_instance_managers,
     ));

     let panicking_subscription_manager = Arc::new(PanicSubscriptionManager {});

-    let subgraph_registrar = Arc::new(IpfsSubgraphRegistrar::new(
+    let subgraph_registrar = Arc::new(graph_core::subgraph::SubgraphRegistrar::new(
         &logger_factory,
         link_resolver.cheap_clone(),
-        subgraph_provider.clone(),
+        subgraph_provider.cheap_clone(),
         subgraph_store.clone(),
         panicking_subscription_manager,
+        amp_client,
         blockchain_map,
         node_id.clone(),
         SubgraphVersionSwitchingMode::Instant,
@@ -216,7 +261,9 @@ pub async fn run(

     let locator = locate(subgraph_store.as_ref(), &hash)?;

-    SubgraphAssignmentProvider::start(subgraph_provider.as_ref(), locator, Some(stop_block)).await;
+    subgraph_provider
+        .start_subgraph(locator, Some(stop_block))
+        .await;

     loop {
         tokio::time::sleep(Duration::from_millis(1000)).await;
diff --git a/node/src/opt.rs b/node/src/opt.rs
index 9928144396a..3708a7da493 100644
--- a/node/src/opt.rs
+++ b/node/src/opt.rs
@@ -230,6 +230,14 @@ pub struct Opt {
         help = "Port for the graphman GraphQL server"
     )]
     pub graphman_port: u16,
+
+    #[clap(
+        long,
+        value_name = "{HOST:PORT|URL}",
+        env = "GRAPH_AMP_FLIGHT_SERVICE_ADDRESS",
+        help = "The address of the Amp Flight gRPC service"
+    )]
+    pub amp_flight_service_address: Option,
 }

 impl From for config::Opt {
diff --git a/runtime/wasm/src/host.rs b/runtime/wasm/src/host.rs
index bc5610a63d0..aa079381a94 100644
--- a/runtime/wasm/src/host.rs
+++ b/runtime/wasm/src/host.rs
@@ -363,6 +363,7 @@ impl RuntimeHostTrait for RuntimeHost {
             DataSource::Onchain(_) => None,
             DataSource::Offchain(ds) => ds.done_at(),
             DataSource::Subgraph(_) => None,
+            DataSource::Amp(_) => None,
         }
     }

@@ -371,6 +372,7 @@ impl RuntimeHostTrait for RuntimeHost {
             DataSource::Onchain(_) => {}
             DataSource::Offchain(ds) => ds.set_done_at(block),
             DataSource::Subgraph(_) => {}
+            DataSource::Amp(_) => {}
         }
     }
diff --git a/server/index-node/src/resolver.rs b/server/index-node/src/resolver.rs
index dbcb4cb93a0..af6fd0888ec 100644
--- a/server/index-node/src/resolver.rs
+++ b/server/index-node/src/resolver.rs
@@ -7,6 +7,7 @@ use graph::schema::EntityType;
 use web3::types::Address;

 use git_testament::{git_testament, CommitKind};
+use graph::amp;
 use graph::blockchain::{Blockchain, BlockchainKind, BlockchainMap};
 use graph::components::link_resolver::LinkResolverContext;
 use graph::components::store::{BlockPtrForNumber, BlockStore, QueryPermit, Store};
@@ -95,19 +96,25 @@ impl IntoValue for PublicProofOfIndexingResult {

 /// Resolver for the index node GraphQL API.
 #[derive(Clone)]
-pub struct IndexNodeResolver {
+pub struct IndexNodeResolver {
     logger: Logger,
     blockchain_map: Arc,
     store: Arc,
     link_resolver: Arc,
+    amp_client: Option>,
     bearer_token: Option,
 }

-impl IndexNodeResolver {
+impl IndexNodeResolver
+where
+    S: Store,
+    AC: amp::Client + Send + Sync + 'static,
+{
     pub fn new(
         logger: &Logger,
         store: Arc,
         link_resolver: Arc,
+        amp_client: Option>,
         bearer_token: Option,
         blockchain_map: Arc,
     ) -> Self {
@@ -118,6 +125,7 @@
             blockchain_map,
             store,
             link_resolver,
+            amp_client,
             bearer_token,
         }
     }
@@ -514,6 +522,7 @@
             deployment_hash.clone(),
             raw_yaml,
             &self.link_resolver,
+            self.amp_client.cheap_clone(),
             &self.logger,
             max_spec_version,
         )
@@ -531,6 +540,7 @@
             deployment_hash.clone(),
             raw_yaml,
             &self.link_resolver,
+            self.amp_client.cheap_clone(),
             &self.logger,
             max_spec_version,
         )
@@ -548,6 +558,7 @@
             deployment_hash.clone(),
             raw_yaml,
             &self.link_resolver,
+            self.amp_client.cheap_clone(),
             &self.logger,
             max_spec_version,
         )
@@ -682,7 +693,11 @@ impl IndexNodeResolver {
 }

 #[async_trait]
-impl BlockPtrForNumber for IndexNodeResolver {
+impl BlockPtrForNumber for IndexNodeResolver
+where
+    S: Store,
+    AC: amp::Client + Send + Sync + 'static,
+{
     async fn block_ptr_for_number(
         &self,
         network: String,
@@ -755,7 +770,11 @@ fn entity_changes_to_graphql(entity_changes: Vec) -> r::Value {
 }

 #[async_trait]
-impl Resolver for IndexNodeResolver {
+impl Resolver for IndexNodeResolver
+where
+    S: Store,
+    AC: amp::Client + Send + Sync + 'static,
+{
     const CACHEABLE: bool = false;

     async fn query_permit(&self) -> QueryPermit {
diff --git a/server/index-node/src/server.rs b/server/index-node/src/server.rs
index 326d633b896..00b62c09ca2 100644
--- a/server/index-node/src/server.rs
+++ b/server/index-node/src/server.rs
@@ -1,5 +1,7 @@
 use graph::{
+    amp,
     blockchain::BlockchainMap,
+    cheap_clone::CheapClone,
     components::{
         server::server::{start, ServerHandle},
         store::Store,
@@ -10,16 +12,18 @@ use graph::{
 use crate::service::IndexNodeService;

 /// A GraphQL server based on Hyper.
-pub struct IndexNodeServer {
+pub struct IndexNodeServer {
     logger: Logger,
     blockchain_map: Arc,
     store: Arc,
     link_resolver: Arc,
+    amp_client: Option>,
 }

-impl IndexNodeServer
+impl IndexNodeServer
 where
     S: Store,
+    AC: amp::Client + Send + Sync + 'static,
 {
     /// Creates a new GraphQL server.
     pub fn new(
@@ -27,6 +31,7 @@ where
         blockchain_map: Arc,
         store: Arc,
         link_resolver: Arc,
+        amp_client: Option>,
     ) -> Self {
         let logger = logger_factory.component_logger(
             "IndexNodeServer",
@@ -42,6 +47,7 @@ where
             blockchain_map,
             store,
             link_resolver,
+            amp_client,
         }
     }

@@ -62,6 +68,7 @@ where
             self.blockchain_map.clone(),
             store,
             self.link_resolver.clone(),
+            self.amp_client.cheap_clone(),
         ));

         start(logger_for_service.clone(), port, move |req| {
diff --git a/server/index-node/src/service.rs b/server/index-node/src/service.rs
index d07d9b9e5e3..5aa00058e6c 100644
--- a/server/index-node/src/service.rs
+++ b/server/index-node/src/service.rs
@@ -15,6 +15,7 @@ use graph::hyper::header::{
 };
 use graph::hyper::{body::Body, Method, Request, Response, StatusCode};

+use graph::amp;
 use graph::components::{server::query::ServerError, store::Store};
 use graph::data::query::{Query, QueryError, QueryResult, QueryResults};
 use graph::prelude::{q, serde_json};
@@ -39,23 +40,26 @@ impl GraphQLMetrics for NoopGraphQLMetrics {

 /// A Hyper Service that serves GraphQL over a POST / endpoint.
 #[derive(Debug)]
-pub struct IndexNodeService {
+pub struct IndexNodeService {
     logger: Logger,
     blockchain_map: Arc,
     store: Arc,
     explorer: Arc>,
     link_resolver: Arc,
+    amp_client: Option>,
 }

-impl IndexNodeService
+impl IndexNodeService
 where
     S: Store,
+    AC: amp::Client + Send + Sync + 'static,
 {
     pub fn new(
         logger: Logger,
         blockchain_map: Arc,
         store: Arc,
         link_resolver: Arc,
+        amp_client: Option>,
     ) -> Self {
         let explorer = Arc::new(Explorer::new(store.clone()));

@@ -65,6 +69,7 @@ where
             store,
             explorer,
             link_resolver,
+            amp_client,
         }
     }

@@ -138,6 +143,7 @@ where
             &logger,
             store,
             self.link_resolver.clone(),
+            self.amp_client.cheap_clone(),
             validated.bearer_token,
             self.blockchain_map.clone(),
         );
diff --git a/store/test-store/tests/chain/ethereum/manifest.rs b/store/test-store/tests/chain/ethereum/manifest.rs
index b72f70dcd78..f52930f71bd 100644
--- a/store/test-store/tests/chain/ethereum/manifest.rs
+++ b/store/test-store/tests/chain/ethereum/manifest.rs
@@ -4,6 +4,7 @@ use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;

+use graph::amp;
 use graph::blockchain::DataSource;
 use graph::components::store::BLOCK_NUMBER_MAX;
 use graph::data::store::scalar::Bytes;
@@ -138,7 +139,15 @@ async fn try_resolve_manifest(
     let resolver: Arc = Arc::new(resolver);
     let raw = serde_yaml::from_str(text)?;

-    Ok(SubgraphManifest::resolve_from_raw(id, raw, &resolver, &LOGGER, max_spec_version).await?)
+    Ok(SubgraphManifest::resolve_from_raw(
+        id,
+        raw,
+        &resolver,
+        Option::>::None,
+        &LOGGER,
+        max_spec_version,
+    )
+    .await?)
 }

 async fn resolve_manifest(
@@ -160,9 +169,16 @@ async fn resolve_unvalidated(text: &str) -> UnvalidatedSubgraphManifest {
     let resolver: Arc = Arc::new(resolver);
     let raw = serde_yaml::from_str(text).unwrap();

-    UnvalidatedSubgraphManifest::resolve(id, raw, &resolver, &LOGGER, SPEC_VERSION_0_0_4.clone())
-        .await
-        .expect("Parsing simple manifest works")
+    UnvalidatedSubgraphManifest::resolve(
+        id,
+        raw,
+        &resolver,
+        Option::>::None,
+        &LOGGER,
+        SPEC_VERSION_0_0_4.clone(),
+    )
+    .await
+    .expect("Parsing simple manifest works")
 }

 // Some of these manifest tests should be made chain-independent, but for
@@ -1313,6 +1329,7 @@ schema:
         id,
         raw,
         &resolver,
+        Option::>::None,
         &LOGGER,
         SPEC_VERSION_0_0_4.clone(),
     )
@@ -1365,6 +1382,7 @@ schema:
         id,
         raw,
         &resolver,
+        Option::>::None,
         &LOGGER,
         SPEC_VERSION_0_0_4.clone(),
     )
@@ -1441,6 +1459,7 @@ dataSources:
         id,
         raw,
         &resolver,
+        Option::>::None,
         &LOGGER,
         SPEC_VERSION_0_0_4.clone(),
     )
@@ -1519,6 +1538,7 @@ dataSources:
         id,
         raw,
         &resolver,
+        Option::>::None,
         &LOGGER,
         SPEC_VERSION_0_0_4.clone(),
     )
@@ -1628,6 +1648,7 @@ dataSources:
         id,
         raw,
         &resolver,
+        Option::>::None,
         &LOGGER,
         SPEC_VERSION_1_2_0.clone(),
     )
@@ -1701,6 +1722,7 @@ dataSources:
         id,
         raw,
         &resolver,
+        Option::>::None,
         &LOGGER,
         SPEC_VERSION_1_3_0.clone(),
     )
@@ -1851,6 +1873,7 @@ specVersion: 1.3.0
         id,
         raw,
         &resolver,
+        Option::>::None,
         &LOGGER,
         SPEC_VERSION_1_3_0.clone(),
     )
diff --git a/tests/Cargo.toml b/tests/Cargo.toml
index 3d6a3771a93..268f7c3c6cd 100644
--- a/tests/Cargo.toml
+++ b/tests/Cargo.toml
@@ -24,6 +24,7 @@ tokio = { version = "1.45.1", features = ["rt", "macros", "process"] }
 # here needs to be kept in sync with the web3 version that the graph crate
 # uses until then
 secp256k1 = { version = "0.21", features = ["recovery"] }
+tokio-util.workspace = true

 [dev-dependencies]
 anyhow = "1.0.100"
diff --git a/tests/src/fixture/mod.rs b/tests/src/fixture/mod.rs
index 362cef37f44..27daa844c1c 100644
--- a/tests/src/fixture/mod.rs
+++ b/tests/src/fixture/mod.rs
@@ -9,6 +9,7 @@ use std::time::{Duration, Instant};

 use anyhow::Error;
 use async_stream::stream;
+use graph::amp;
 use graph::blockchain::block_stream::{
     BlockRefetcher, BlockStream, BlockStreamBuilder, BlockStreamError, BlockStreamEvent,
     BlockWithTriggers, FirehoseCursor,
@@ -24,7 +25,7 @@ use graph::components::link_resolver::{
 use graph::components::metrics::MetricsRegistry;
 use graph::components::network_provider::ChainName;
 use graph::components::store::{DeploymentLocator, EthereumCallCache, SourceableStore};
-use graph::components::subgraph::Settings;
+use graph::components::subgraph::{Settings, SubgraphInstanceManager as _};
 use graph::data::graphql::load_manager::LoadManager;
 use graph::data::query::{Query, QueryTarget};
 use graph::data::subgraph::schema::{SubgraphError, SubgraphHealth};
@@ -42,18 +43,14 @@ use graph::prelude::serde_json::{self, json};
 use graph::prelude::{
     async_trait, lazy_static, q, r, ApiVersion, BigInt, BlockNumber, DeploymentHash,
     GraphQlRunner as _, IpfsResolver, LinkResolver, LoggerFactory, NodeId, QueryError,
-    SubgraphAssignmentProvider, SubgraphCountMetric, SubgraphName, SubgraphRegistrar,
-    SubgraphStore as _, SubgraphVersionSwitchingMode, TriggerProcessor,
+    SubgraphCountMetric, SubgraphName, SubgraphRegistrar, SubgraphStore as _,
+    SubgraphVersionSwitchingMode, TriggerProcessor,
 };
 use graph::schema::InputSchema;
 use graph_chain_ethereum::chain::RuntimeAdapterBuilder;
 use graph_chain_ethereum::network::EthereumNetworkAdapters;
 use graph_chain_ethereum::Chain;
 use graph_core::polling_monitor::{arweave_service, ipfs_service};
-use graph_core::{
-    SubgraphAssignmentProvider as IpfsSubgraphAssignmentProvider, SubgraphInstanceManager,
-    SubgraphRegistrar as IpfsSubgraphRegistrar, SubgraphTriggerProcessor,
-};
 use graph_node::manager::PanicSubscriptionManager;
 use graph_node::{config::Config, store_builder::StoreBuilder};
 use graph_runtime_wasm::RuntimeHostBuilder;
@@ -158,21 +155,22 @@ pub trait TestChainTrait {

 pub struct TestContext {
     pub logger: Logger,
-    pub provider: Arc<
-        IpfsSubgraphAssignmentProvider<
-            SubgraphInstanceManager,
-        >,
-    >,
+    pub provider: Arc,
     pub store: Arc,
     pub deployment: DeploymentLocator,
     pub subgraph_name: SubgraphName,
-    pub instance_manager: SubgraphInstanceManager,
+    pub instance_manager: Arc<
+        graph_core::subgraph::SubgraphInstanceManager<
+            graph_store_postgres::SubgraphStore,
+            amp::FlightClient,
+        >,
+    >,
     pub link_resolver: Arc,
     pub arweave_resolver: Arc,
     pub env_vars: Arc,
     pub ipfs: Arc,
     graphql_runner: Arc,
-    indexing_status_service: Arc>,
+    indexing_status_service: Arc>,
 }

 #[derive(Deserialize)]
@@ -204,12 +202,13 @@ impl TestContext {
     pub async fn runner(
         &self,
         stop_block: BlockPtr,
-    ) -> graph_core::SubgraphRunner<
+    ) -> graph_core::subgraph::SubgraphRunner<
        graph_chain_ethereum::Chain,
         RuntimeHostBuilder,
     > {
         let (logger, deployment, raw) = self.get_runner_context().await;
-        let tp: Box> = Box::new(SubgraphTriggerProcessor {});
+        let tp: Box> =
+            Box::new(graph_core::subgraph::SubgraphTriggerProcessor {});

         let deployment_status_metric = self
             .instance_manager
@@ -233,7 +232,7 @@ impl TestContext {
     pub async fn runner_substreams(
         &self,
         stop_block: BlockPtr,
-    ) -> graph_core::SubgraphRunner<
+    ) -> graph_core::subgraph::SubgraphRunner<
         graph_chain_substreams::Chain,
         RuntimeHostBuilder,
     > {
@@ -282,10 +281,13 @@ impl TestContext {
     pub async fn start_and_sync_to(&self, stop_block: BlockPtr) {
         // In case the subgraph has been previously started.
-        self.provider.stop(self.deployment.clone()).await;
+        self.provider
+            .stop_subgraph(self.deployment.cheap_clone())
+            .await;

         self.provider
-            .start(self.deployment.clone(), Some(stop_block.number))
+            .cheap_clone()
+            .start_subgraph(self.deployment.cheap_clone(), Some(stop_block.number))
             .await;

         debug!(self.logger, "TEST: syncing to {}", stop_block.number);
@@ -302,9 +304,14 @@ impl TestContext {
     pub async fn start_and_sync_to_error(&self, stop_block: BlockPtr) -> SubgraphError {
         // In case the subgraph has been previously started.
-        self.provider.stop(self.deployment.clone()).await;
+        self.provider
+            .stop_subgraph(self.deployment.cheap_clone())
+            .await;

-        self.provider.start(self.deployment.clone(), None).await;
+        self.provider
+            .cheap_clone()
+            .start_subgraph(self.deployment.cheap_clone(), None)
+            .await;

         wait_for_sync(
             &self.logger,
@@ -542,7 +549,8 @@ pub async fn setup_inner(
     let sg_count = Arc::new(SubgraphCountMetric::new(mock_registry.cheap_clone()));

     let blockchain_map = Arc::new(blockchain_map);
-    let subgraph_instance_manager = SubgraphInstanceManager::new(
+
+    let subgraph_instance_manager = Arc::new(graph_core::subgraph::SubgraphInstanceManager::new(
         &logger_factory,
         env_vars.cheap_clone(),
         subgraph_store.clone(),
@@ -552,9 +560,26 @@ pub async fn setup_inner(
         link_resolver.cheap_clone(),
         ipfs_service,
         arweave_service,
+        None,
         static_filters,
+    ));
+
+    let mut subgraph_instance_managers =
+        graph_core::subgraph_provider::SubgraphInstanceManagers::new();
+
+    subgraph_instance_managers.add(
+        graph_core::subgraph_provider::SubgraphProcessingKind::Trigger,
+        subgraph_instance_manager.cheap_clone(),
     );

+    let subgraph_provider = Arc::new(graph_core::subgraph_provider::SubgraphProvider::new(
+        &logger_factory,
+        sg_count.cheap_clone(),
+        link_resolver.cheap_clone(),
+        tokio_util::sync::CancellationToken::new(),
+        subgraph_instance_managers,
+    ));
+
     // Graphql runner
     let load_manager = LoadManager::new(&logger, Vec::new(), Vec::new(), mock_registry.clone());
     let graphql_runner = Arc::new(GraphQlRunner::new(
@@ -569,23 +594,18 @@ pub async fn setup_inner(
         blockchain_map.cheap_clone(),
         stores.network_store.cheap_clone(),
         link_resolver.cheap_clone(),
-    ));
-
-    // Create IPFS-based subgraph provider
-    let subgraph_provider = Arc::new(IpfsSubgraphAssignmentProvider::new(
-        &logger_factory,
-        subgraph_instance_manager.clone(),
-        sg_count,
+        None,
     ));

     let panicking_subscription_manager = Arc::new(PanicSubscriptionManager {});

-    let subgraph_registrar = Arc::new(IpfsSubgraphRegistrar::new(
+    let subgraph_registrar = Arc::new(graph_core::subgraph::SubgraphRegistrar::new(
         &logger_factory,
         link_resolver.cheap_clone(),
-        subgraph_provider.clone(),
+        subgraph_provider.cheap_clone(),
         subgraph_store.clone(),
         panicking_subscription_manager,
+        Option::>::None,
         blockchain_map.clone(),
         node_id.clone(),
         SubgraphVersionSwitchingMode::Instant,
diff --git a/tests/tests/runner_tests.rs b/tests/tests/runner_tests.rs
index cd2c059e2dc..99dacf63b84 100644
--- a/tests/tests/runner_tests.rs
+++ b/tests/tests/runner_tests.rs
@@ -7,6 +7,7 @@ use std::time::Duration;
 use assert_json_diff::assert_json_eq;
 use graph::blockchain::block_stream::BlockWithTriggers;
 use graph::blockchain::{Block, BlockPtr, Blockchain};
+use graph::components::subgraph::SubgraphInstanceManager as _;
 use graph::data::store::scalar::Bytes;
 use graph::data::subgraph::schema::{SubgraphError, SubgraphHealth};
 use graph::data::value::Word;
@@ -16,7 +17,7 @@ use graph::ipfs::test_utils::add_files_to_local_ipfs_node_for_testing;
 use graph::object;
 use graph::prelude::ethabi::ethereum_types::H256;
 use graph::prelude::web3::types::Address;
-use graph::prelude::{hex, CheapClone, SubgraphAssignmentProvider, SubgraphName, SubgraphStore};
+use graph::prelude::{hex, CheapClone, SubgraphName, SubgraphStore};
 use graph_tests::fixture::ethereum::{
     chain, empty_block, generate_empty_blocks_for_range, genesis, push_test_command, push_test_log,
     push_test_polling_trigger,
@@ -82,7 +83,10 @@ async fn data_source_revert() -> anyhow::Result<()> {
     let stop_block = test_ptr(2);
     base_ctx.start_and_sync_to(stop_block).await;

-    base_ctx.provider.stop(base_ctx.deployment.clone()).await;
+    base_ctx
+        .provider
+        .stop_subgraph(base_ctx.deployment.clone())
+        .await;

     // Test loading data sources from DB.
     let stop_block = test_ptr(3);
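For reference, a minimal, self-contained sketch of the CancellationToken-based cooperative shutdown that this patch threads through main_inner and run. This is illustrative only and not part of the patch; it assumes the tokio (with the signal and time features) and tokio-util crates, and the worker/task names are hypothetical.

use std::time::Duration;
use tokio_util::sync::CancellationToken;

#[tokio::main]
async fn main() {
    // Root token; clones observe cancellation of the root.
    let cancel_token = CancellationToken::new();
    let worker_token = cancel_token.clone();

    // A worker that stops cooperatively once the token is cancelled.
    let worker = tokio::spawn(async move {
        tokio::select! {
            _ = worker_token.cancelled() => println!("worker: shutting down"),
            _ = tokio::time::sleep(Duration::from_secs(3600)) => println!("worker: finished"),
        }
    });

    // Cancel on Ctrl-C, then wait for the worker to wind down.
    tokio::signal::ctrl_c().await.expect("failed to listen for Ctrl-C");
    cancel_token.cancel();
    worker.await.expect("worker task panicked");
}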