diff --git a/Cargo.toml b/Cargo.toml
index a785dcbf..52916a01 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,7 +6,6 @@ members = [
 ]
 resolver = "2"
 
-
 [profile.release]
 opt-level = 3
diff --git a/config.toml b/config.toml
new file mode 100644
index 00000000..20ccf5db
--- /dev/null
+++ b/config.toml
@@ -0,0 +1,10 @@
+[patch.'https://github.com/zkMIPS/plonky2']
+#plonky2 = { path = "../plonky2-gpu/plonky2" }
+#plonky2_util = { path = "../plonky2-gpu/util" }
+#plonky2_field = { path = "../plonky2-gpu/field" }
+#plonky2_maybe_rayon = { path = "../plonky2-gpu/maybe_rayon" }
+plonky2 = { git = "https://github.com/zkMIPS/plonky2-gpu.git" }
+plonky2_util = { git = "https://github.com/zkMIPS/plonky2-gpu.git" }
+plonky2_field = { git = "https://github.com/zkMIPS/plonky2-gpu.git" }
+plonky2_maybe_rayon = { git = "https://github.com/zkMIPS/plonky2-gpu.git" }
+
diff --git a/prover/Cargo.toml b/prover/Cargo.toml
index e78f7dd8..23e5ecbe 100644
--- a/prover/Cargo.toml
+++ b/prover/Cargo.toml
@@ -16,6 +16,7 @@ bincode = "1.3.3"
 plonky2 = { git = "https://github.com/zkMIPS/plonky2.git", branch = "zkm_dev" }
 #starky = { git = "https://github.com/zkMIPS/plonky2.git", branch = "zkm_dev" }
+plonky2_field = { git = "https://github.com/zkMIPS/plonky2.git", branch = "zkm_dev" }
 plonky2_util = { git = "https://github.com/zkMIPS/plonky2.git", branch = "zkm_dev" }
 plonky2_maybe_rayon = { git = "https://github.com/zkMIPS/plonky2.git", branch = "zkm_dev" }
 
@@ -39,6 +40,8 @@ lazy_static = "1.4.0"
 elf = { version = "0.7", default-features = false }
 sha2 = { version = "0.10.8", default-features = false }
 
+rustacuda = { version = "0.1.3", optional = true }
+rustacuda_core = { version = "0.1.2", optional = true }
 
 [dev-dependencies]
 env_logger = "0.10.0"
@@ -49,6 +52,7 @@ plonky2x-derive = { git = "https://github.com/zkMIPS/succinctx.git", package = "
 
 [features]
 test = []
+gpu = ["rustacuda", "rustacuda_core"]
 
 [profile.release]
 debug = true
diff --git a/prover/examples/zkmips.rs b/prover/examples/zkmips.rs
index dbe05ef6..f59a9d03 100644
--- a/prover/examples/zkmips.rs
+++ b/prover/examples/zkmips.rs
@@ -1,3 +1,5 @@
+#![feature(allocator_api)]
+
 use serde::{Deserialize, Serialize};
 use std::env;
 use std::fs::File;
@@ -6,6 +8,8 @@ use std::io::BufReader;
 use std::ops::Range;
 use std::time::Duration;
 
+use log::LevelFilter;
+
 use plonky2::field::goldilocks_field::GoldilocksField;
 use plonky2::plonk::config::{GenericConfig, PoseidonGoldilocksConfig};
 use plonky2::util::timing::TimingTree;
@@ -23,9 +27,25 @@ use zkm_prover::fixed_recursive_verifier::AllRecursiveCircuits;
 use zkm_prover::generation::state::{AssumptionReceipt, AssumptionReceipts, Receipt};
 use zkm_prover::proof;
 use zkm_prover::proof::PublicValues;
+#[cfg(not(feature = "gpu"))]
 use zkm_prover::prover::prove;
 use zkm_prover::verifier::verify_proof;
 
+#[cfg(feature = "gpu")]
+use plonky2::{
+    field::extension::Extendable,
+    field::fft::fft_root_table,
+    field::types::Field,
+    fri::oracle::{create_task, CudaInnerContext, MyAllocator},
+    plonk::config::Hasher,
+};
+#[cfg(feature = "gpu")]
+use rustacuda::{memory::DeviceBuffer, prelude::*};
+#[cfg(feature = "gpu")]
+use std::{collections::BTreeMap, sync::Arc};
+#[cfg(feature = "gpu")]
+use zkm_prover::prover::prove_gpu;
+
 const DEGREE_BITS_RANGE: [Range<usize>; 6] = [10..21, 12..22, 12..21, 8..21, 6..21, 13..23];
 
 fn split_segments() {
@@ -67,6 +87,15 @@ type C = PoseidonGoldilocksConfig;
 type F = <C as GenericConfig<D>>::F;
 
 fn prove_single_seg_common(seg_file: &str, basedir: &str, block: &str, file: &str) {
+    #[cfg(feature = "gpu")]
prove_single_seg_gpu(seg_file, basedir, block, file); + + #[cfg(not(feature = "gpu"))] + prove_single_seg_cpu(seg_file, basedir, block, file); +} + +#[cfg(not(feature = "gpu"))] +fn prove_single_seg_cpu(seg_file: &str, basedir: &str, block: &str, file: &str) { let seg_reader = BufReader::new(File::open(seg_file).unwrap()); let kernel = segment_kernel(basedir, block, file, seg_reader); @@ -87,6 +116,151 @@ fn prove_single_seg_common(seg_file: &str, basedir: &str, block: &str, file: &st log::info!("Prove done"); } +#[cfg(feature = "gpu")] +fn create_gpu_context( + config: &StarkConfig, +) -> plonky2::fri::oracle::CudaInvContext { + const D: usize = 2; + type C = PoseidonGoldilocksConfig; + type F = >::F; + + rustacuda::init(CudaFlags::empty()).unwrap(); + let device_index = 0; + let device = Device::get_device(device_index).unwrap(); + let _ctx = Context::create_and_push(ContextFlags::MAP_HOST | ContextFlags::SCHED_AUTO, device) + .unwrap(); + let stream = Stream::new(StreamFlags::NON_BLOCKING, None).unwrap(); + let stream2 = Stream::new(StreamFlags::NON_BLOCKING, None).unwrap(); + + let max_lg_n = 22; + let max_values_flatten_len = (1 << max_lg_n) * 32; + + let rate_bits = config.fri_config.rate_bits; + let blinding = false; + const SALT_SIZE: usize = 4; + let cap_height = config.fri_config.cap_height; + let salt_size = if blinding { SALT_SIZE } else { 0 }; + + let fft_root_table_max = fft_root_table(1 << (max_lg_n + rate_bits)).concat(); + let root_table_device = { DeviceBuffer::from_slice(&fft_root_table_max).unwrap() }; + + let fft_root_table_ext = + fft_root_table::<>::Extension>(1 << (24)).concat(); + let root_table_ext_device = { DeviceBuffer::from_slice(&fft_root_table_ext).unwrap() }; + + let shift_powers = F::coset_shift() + .powers() + .take(1 << (max_lg_n)) + .collect::>(); + let shift_powers_device = { DeviceBuffer::from_slice(&shift_powers).unwrap() }; + + let shift_powers_ext = <>::Extension>::coset_shift() + .powers() + .take(1 << (22)) + .collect::>(); + let shift_powers_ext_device = { DeviceBuffer::from_slice(&shift_powers_ext).unwrap() }; + + let max_values_num_per_poly = 1 << max_lg_n; + // let max_values_flatten_len = 132644864; + let max_ext_values_flatten_len = + (max_values_flatten_len + salt_size * max_values_num_per_poly) * (1 << rate_bits); + let mut ext_values_flatten: Vec = Vec::with_capacity(max_ext_values_flatten_len); + unsafe { + ext_values_flatten.set_len(max_ext_values_flatten_len); + } + + let mut values_flatten: Vec = + Vec::with_capacity_in(max_values_flatten_len, MyAllocator {}); + unsafe { + values_flatten.set_len(max_values_flatten_len); + } + + let len_cap = 1 << cap_height; + let num_digests = 2 * (max_values_num_per_poly * (1 << rate_bits) - len_cap); + let num_digests_and_caps = num_digests + len_cap; + let mut digests_and_caps_buf: Vec<<>::Hasher as Hasher>::Hash> = + Vec::with_capacity(num_digests_and_caps); + unsafe { + digests_and_caps_buf.set_len(num_digests_and_caps); + } + + let pad_extvalues_len = max_ext_values_flatten_len; + let cache_mem_device = { + unsafe { + DeviceBuffer::::uninitialized( + // values_flatten_len + + pad_extvalues_len + max_ext_values_flatten_len + digests_and_caps_buf.len() * 4, + ) + } + .unwrap() + }; + + let mut ctx = plonky2::fri::oracle::CudaInvContext { + inner: CudaInnerContext { stream, stream2 }, + ext_values_flatten: Arc::new(ext_values_flatten), + values_flatten: Arc::new(values_flatten), + digests_and_caps_buf: Arc::new(digests_and_caps_buf), + cache_mem_device, + root_table_device, + shift_powers_device, 
+ // cache_mem_ext_device, + root_table_ext_device, + shift_powers_ext_device, + tasks: BTreeMap::new(), + ctx: _ctx, + }; + + let use_dynamic_alloc = std::env::var("USE_DYNAMIC_ALLOC").unwrap_or("0".to_string()) == "1"; + + if !use_dynamic_alloc { + for i in 0..18 { + create_task( + &mut ctx, + i, + max_lg_n, + max_values_flatten_len / (1 << max_lg_n), + 0, + 2, + 4, + ); + } + } + + ctx +} + +#[cfg(feature = "gpu")] +fn prove_single_seg_gpu(seg_file: &str, basedir: &str, block: &str, file: &str) { + let seg_reader = BufReader::new(File::open(seg_file).unwrap()); + let kernel = segment_kernel(&basedir, &block, &file, seg_reader); + + const D: usize = 2; + type C = PoseidonGoldilocksConfig; + type F = >::F; + + let allstark: AllStark = AllStark::default(); + let config = StarkConfig::standard_fast_config(); + + // let mut ctx: plonky2::fri::oracle::CudaInvContext; + let mut ctx = create_gpu_context(&config); + let mut timing = TimingTree::new("prove", log::Level::Info); + let allproof: proof::AllProof = + prove_gpu(&allstark, &kernel, &config, &mut timing, &mut ctx).unwrap(); + + let mut count_bytes = 0; + for (row, proof) in allproof.stark_proofs.clone().iter().enumerate() { + let proof_str = serde_json::to_string(&proof.proof).unwrap(); + log::info!("row:{} proof bytes:{}", row, proof_str.len()); + count_bytes += proof_str.len(); + } + // timing.filter(Duration::from_millis(100)).print(); + timing.print(); + + log::info!("total proof bytes:{}KB", count_bytes / 1024); + verify_proof(&allstark, allproof, &config).unwrap(); + log::info!("Prove done"); +} + fn prove_multi_seg_common( seg_dir: &str, basedir: &str, @@ -94,6 +268,26 @@ fn prove_multi_seg_common( file: &str, seg_file_number: usize, seg_start_id: usize, +) -> anyhow::Result<()> { + #[cfg(feature = "gpu")] + let ret = + prove_multi_seg_common_gpu(seg_dir, basedir, block, file, seg_file_number, seg_start_id); + + #[cfg(not(feature = "gpu"))] + let ret = + prove_multi_seg_common_cpu(seg_dir, basedir, block, file, seg_file_number, seg_start_id); + + ret +} + +#[cfg(not(feature = "gpu"))] +fn prove_multi_seg_common_cpu( + seg_dir: &str, + basedir: &str, + block: &str, + file: &str, + seg_file_number: usize, + seg_start_id: usize, ) -> anyhow::Result<()> { type InnerParameters = DefaultParameters; type OuterParameters = Groth16WrapperParameters; @@ -252,6 +446,183 @@ fn prove_multi_seg_common( result } +#[cfg(feature = "gpu")] +fn prove_multi_seg_common_gpu( + seg_dir: &str, + basedir: &str, + block: &str, + file: &str, + seg_file_number: usize, + seg_start_id: usize, +) -> anyhow::Result<()> { + type InnerParameters = DefaultParameters; + type OuterParameters = Groth16WrapperParameters; + + type F = GoldilocksField; + const D: usize = 2; + type C = PoseidonGoldilocksConfig; + + if seg_file_number < 2 { + panic!("seg file number must >= 2\n"); + } + + let total_timing = TimingTree::new("prove total time", log::Level::Info); + let all_stark = AllStark::::default(); + let config = StarkConfig::standard_fast_config(); + // Preprocess all circuits. 
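+    // Building the recursion circuits covers every STARK table over DEGREE_BITS_RANGE and is an
+    // expensive one-off step, so the result is constructed once here and reused for all segments
+    // proved below.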
+ let all_circuits = + AllRecursiveCircuits::::new(&all_stark, &DEGREE_BITS_RANGE, &config); + + let mut ctx = create_gpu_context(&config); + + let seg_file = format!("{}/{}", seg_dir, seg_start_id); + log::info!("Process segment {}", seg_file); + let seg_reader = BufReader::new(File::open(seg_file)?); + let input_first = segment_kernel(basedir, block, file, seg_reader); + let mut timing = TimingTree::new("prove root first", log::Level::Info); + let (mut agg_proof, mut updated_agg_public_values) = + all_circuits.prove_root_gpu(&all_stark, &input_first, &config, &mut timing, &mut ctx)?; + + timing.filter(Duration::from_millis(100)).print(); + all_circuits.verify_root(agg_proof.clone())?; + + let mut base_seg = seg_start_id + 1; + let mut seg_num = seg_file_number - 1; + let mut is_agg = false; + + if seg_file_number % 2 == 0 { + let seg_file = format!("{}/{}", seg_dir, seg_start_id + 1); + log::info!("Process segment {}", seg_file); + let seg_reader = BufReader::new(File::open(seg_file)?); + let input = segment_kernel(basedir, block, file, seg_reader); + timing = TimingTree::new("prove root second", log::Level::Info); + let (root_proof, public_values) = + all_circuits.prove_root_gpu(&all_stark, &input, &config, &mut timing, &mut ctx)?; + timing.filter(Duration::from_millis(100)).print(); + + all_circuits.verify_root(root_proof.clone())?; + + // Update public values for the aggregation. + let agg_public_values = PublicValues { + roots_before: updated_agg_public_values.roots_before, + roots_after: public_values.roots_after, + userdata: public_values.userdata, + }; + timing = TimingTree::new("prove aggression", log::Level::Info); + // We can duplicate the proofs here because the state hasn't mutated. + (agg_proof, updated_agg_public_values) = all_circuits.prove_aggregation( + false, + &agg_proof, + false, + &root_proof, + agg_public_values.clone(), + )?; + timing.filter(Duration::from_millis(100)).print(); + all_circuits.verify_aggregation(&agg_proof)?; + + is_agg = true; + base_seg = seg_start_id + 2; + seg_num -= 1; + } + + for i in 0..seg_num / 2 { + let seg_file = format!("{}/{}", seg_dir, base_seg + (i << 1)); + log::info!("Process segment {}", seg_file); + let seg_reader = BufReader::new(File::open(&seg_file)?); + let input_first = segment_kernel(basedir, block, file, seg_reader); + let mut timing = TimingTree::new("prove root first", log::Level::Info); + let (root_proof_first, first_public_values) = all_circuits.prove_root_gpu( + &all_stark, + &input_first, + &config, + &mut timing, + &mut ctx, + )?; + + timing.filter(Duration::from_millis(100)).print(); + all_circuits.verify_root(root_proof_first.clone())?; + + let seg_file = format!("{}/{}", seg_dir, base_seg + (i << 1) + 1); + log::info!("Process segment {}", seg_file); + let seg_reader = BufReader::new(File::open(&seg_file)?); + let input = segment_kernel(basedir, block, file, seg_reader); + let mut timing = TimingTree::new("prove root second", log::Level::Info); + let (root_proof, public_values) = + all_circuits.prove_root_gpu(&all_stark, &input, &config, &mut timing, &mut ctx)?; + timing.filter(Duration::from_millis(100)).print(); + + all_circuits.verify_root(root_proof.clone())?; + + // Update public values for the aggregation. 
+ let new_agg_public_values = PublicValues { + roots_before: first_public_values.roots_before, + roots_after: public_values.roots_after, + userdata: public_values.userdata, + }; + timing = TimingTree::new("prove aggression", log::Level::Info); + // We can duplicate the proofs here because the state hasn't mutated. + let (new_agg_proof, new_updated_agg_public_values) = all_circuits.prove_aggregation( + false, + &root_proof_first, + false, + &root_proof, + new_agg_public_values, + )?; + timing.filter(Duration::from_millis(100)).print(); + all_circuits.verify_aggregation(&new_agg_proof)?; + + // Update public values for the nested aggregation. + let agg_public_values = PublicValues { + roots_before: updated_agg_public_values.roots_before, + roots_after: new_updated_agg_public_values.roots_after, + userdata: new_updated_agg_public_values.userdata, + }; + timing = TimingTree::new("prove nested aggression", log::Level::Info); + + // We can duplicate the proofs here because the state hasn't mutated. + (agg_proof, updated_agg_public_values) = all_circuits.prove_aggregation( + is_agg, + &agg_proof, + true, + &new_agg_proof, + agg_public_values.clone(), + )?; + is_agg = true; + timing.filter(Duration::from_millis(100)).print(); + + all_circuits.verify_aggregation(&agg_proof)?; + } + + let (block_proof, _block_public_values) = + all_circuits.prove_block(None, &agg_proof, updated_agg_public_values)?; + + log::info!( + "proof size: {:?}", + serde_json::to_string(&block_proof.proof).unwrap().len() + ); + let result = all_circuits.verify_block(&block_proof); + + let build_path = "../verifier/data".to_string(); + let path = format!("{}/test_circuit/", build_path); + let builder = WrapperBuilder::::new(); + let mut circuit = builder.build(); + circuit.set_data(all_circuits.block.circuit); + let mut bit_size = vec![32usize; 16]; + bit_size.extend(vec![8; 32]); + bit_size.extend(vec![64; 68]); + let wrapped_circuit = WrappedCircuit::::build( + circuit, + Some((vec![], bit_size)), + ); + log::info!("build finish"); + + let wrapped_proof = wrapped_circuit.prove(&block_proof).unwrap(); + wrapped_proof.save(path).unwrap(); + + total_timing.filter(Duration::from_millis(100)).print(); + result +} + fn prove_sha2_rust() { // 1. split ELF into segs let elf_path = env::var("ELF_PATH").expect("ELF file is missing"); @@ -297,6 +668,7 @@ fn u32_array_to_u8_vec(u32_array: &[u32; 8]) -> Vec { u8_vec } +#[cfg(not(feature = "gpu"))] fn prove_sha_5_precompile( elf_path: &str, seg_path: &str, @@ -342,6 +714,54 @@ fn prove_sha_5_precompile( } } +#[cfg(feature = "gpu")] +fn prove_sha_5_precompile_gpu( + elf_path: &str, + seg_path: &str, + ctx: &mut plonky2::fri::oracle::CudaInvContext, +) -> Receipt<>::F, C, D> { + let mut state = load_elf_with_patch(elf_path, vec![]); + let n: u32 = 5; + let public_input: [u8; 32] = [ + 37, 148, 182, 169, 46, 191, 177, 195, 49, 45, 235, 125, 1, 192, 21, 251, 149, 233, 251, + 233, 189, 123, 198, 181, 39, 175, 7, 129, 62, 199, 185, 16, + ]; + state.add_input_stream(&public_input.to_vec()); + state.add_input_stream(&n.to_le_bytes().to_vec()); + + let (_total_steps, seg_num, mut state) = split_prog_into_segs(state, seg_path, "", 0); + + let value = state.read_public_values::<[u8; 32]>(); + log::info!("public value: {:?}", value); + + assert!(seg_num == 1); + + let all_stark = AllStark::::default(); + let config = StarkConfig::standard_fast_config(); + // Preprocess all circuits. 
+ let all_circuits = + AllRecursiveCircuits::::new(&all_stark, &DEGREE_BITS_RANGE, &config); + + let seg_file: String = format!("{}/{}", seg_path, 0); + log::info!("Process segment {}", seg_file); + let seg_reader = BufReader::new(File::open(seg_file).unwrap()); + let input_first = segment_kernel("", "", "", seg_reader); + let mut timing = TimingTree::new("prove root with gpu first", log::Level::Info); + let (agg_proof, updated_agg_public_values) = all_circuits + .prove_root_gpu(&all_stark, &input_first, &config, &mut timing, ctx) + .unwrap(); + + timing.filter(Duration::from_millis(100)).print(); + all_circuits.verify_root(agg_proof.clone()).unwrap(); + + Receipt:: { + proof: agg_proof, + root_before: u32_array_to_u8_vec(&updated_agg_public_values.roots_before.root), + userdata: updated_agg_public_values.userdata.clone(), + } +} + +#[cfg(not(feature = "gpu"))] fn prove_sha2_precompile() { // 1. split ELF into segs let elf_path = env::var("ELF_PATH").expect("ELF file is missing"); @@ -418,6 +838,94 @@ fn prove_sha2_precompile() { all_circuits.verify_root(agg_proof.clone()).unwrap(); } +#[cfg(feature = "gpu")] +fn prove_sha2_precompile_gpu() { + log::info!("prove sha2 precompile with gpu"); + + let config = StarkConfig::standard_fast_config(); + let mut ctx = create_gpu_context(&config); + + // 1. split ELF into segs + let elf_path = env::var("ELF_PATH").expect("ELF file is missing"); + let precompile_path = env::var("PRECOMPILE_PATH").expect("PRECOMPILE ELF file is missing"); + let seg_path = env::var("SEG_OUTPUT").expect("Segment output path is missing"); + let mut receipts: AssumptionReceipts = vec![]; + let receipt = prove_sha_5_precompile_gpu(&precompile_path, &seg_path, &mut ctx); + + log::info!( + "elf_id: {:?}, data: {:?}", + receipt.root_before, + receipt.userdata + ); + + let image_id = receipt.root_before.clone(); + receipts.push(receipt.into()); + + let mut state = load_elf_with_patch(&elf_path, vec![]); + + let public_input: [u8; 32] = [ + 91, 15, 50, 181, 63, 91, 186, 46, 9, 26, 167, 190, 200, 232, 40, 101, 149, 181, 253, 89, + 24, 150, 142, 102, 14, 67, 78, 221, 18, 205, 95, 28, + ]; + state.add_input_stream(&public_input.to_vec()); + log::info!("expected public value: {:?}", public_input); + + let private_input: [u8; 32] = [ + 37, 148, 182, 169, 46, 191, 177, 195, 49, 45, 235, 125, 1, 192, 21, 251, 149, 233, 251, + 233, 189, 123, 198, 181, 39, 175, 7, 129, 62, 199, 185, 16, + ]; + log::info!("private input value: {:?}", private_input); + state.add_input_stream(&private_input.to_vec()); + + state.add_input_stream(&image_id); + + let (_total_steps, _seg_num, mut state) = split_prog_into_segs(state, &seg_path, "", 0); + + let value = state.read_public_values::<[u8; 32]>(); + log::info!("public value: {:X?}", value); + log::info!("public value: {} in hex", hex::encode(value)); + + let all_stark = AllStark::::default(); + // Preprocess all circuits. 
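+    // The recursion circuits are rebuilt here, but the CUDA context created at the top of this
+    // function is reused, so the device buffers are only allocated once.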
+ let all_circuits = + AllRecursiveCircuits::::new(&all_stark, &DEGREE_BITS_RANGE, &config); + + let seg_file: String = format!("{}/{}", seg_path, 0); + log::info!("Process segment {}", seg_file); + let seg_reader = BufReader::new(File::open(seg_file).unwrap()); + let kernel = segment_kernel("", "", "", seg_reader); + + let mut timing = TimingTree::new("prove with gpu", log::Level::Info); + let (agg_proof, _updated_agg_public_values, receipts_used) = all_circuits + .prove_root_with_assumption_gpu( + &all_stark, + &kernel, + &config, + &mut timing, + receipts, + &mut ctx, + ) + .unwrap(); + + log::info!("Process assumptions"); + timing = TimingTree::new("prove aggression", log::Level::Info); + + for assumption in receipts_used.borrow_mut().iter_mut() { + let receipt = assumption.1.clone(); + match receipt { + AssumptionReceipt::Proven(receipt) => { + all_circuits.verify_root(receipt.proof.clone()).unwrap(); + } + AssumptionReceipt::Unresolved(assumpt) => { + log::error!("unresolved assumption: {:X?}", assumpt); + } + } + } + log::info!("verify"); + timing.filter(Duration::from_millis(100)).print(); + all_circuits.verify_root(agg_proof.clone()).unwrap(); +} + fn prove_sha2_go() { // 1. split ELF into segs let elf_path = env::var("ELF_PATH").expect("ELF file is missing"); @@ -570,7 +1078,13 @@ fn prove_host() { let host_program = env::var("HOST_PROGRAM").expect("host_program name is missing"); match host_program.as_str() { "sha2_rust" => prove_sha2_rust(), - "sha2_precompile" => prove_sha2_precompile(), + "sha2_precompile" => { + #[cfg(feature = "gpu")] + prove_sha2_precompile_gpu(); + + #[cfg(not(feature = "gpu"))] + prove_sha2_precompile(); + } "sha2_go" => prove_sha2_go(), "revm" => prove_revm(), "add_example" => prove_add_example(), @@ -597,7 +1111,11 @@ fn prove_segments() { } fn main() { - env_logger::try_init().unwrap_or_default(); + //env_logger::try_init().unwrap_or_default(); + let mut builder = env_logger::Builder::from_default_env(); + builder.format_timestamp(None); + builder.filter_level(LevelFilter::Debug); + builder.try_init().unwrap_or_default(); let args: Vec = env::args().collect(); let helper = || { log::info!( diff --git a/prover/p.sh b/prover/p.sh new file mode 100755 index 00000000..92885677 --- /dev/null +++ b/prover/p.sh @@ -0,0 +1,13 @@ +#!/bin/bash +FEAT= +CONF= + +if [ X"$#" != X"0" ]; then + FEAT='--features gpu' + CONF='--config ../config.toml' +fi +echo "$FEAT" +echo "$CONF" + +BASEDIR=../emulator/test-vectors RUST_LOG=info BLOCK_NO=13284491 SEG_FILE_DIR="/tmp/output" SEG_START_ID=0 SEG_NUM=1 SEG_SIZE=262144 \ + cargo run $FEAT $CONF --release --example zkmips prove_segments diff --git a/prover/s.sh b/prover/s.sh new file mode 100755 index 00000000..36b6c1f3 --- /dev/null +++ b/prover/s.sh @@ -0,0 +1,13 @@ +#!/bin/bash +FEAT= +CONF= + +if [ X"$#" != X"0" ]; then + FEAT='--features gpu' + CONF='--config ../config.toml' +fi +echo "$FEAT" +echo "$CONF" + +BASEDIR=../emulator/test-vectors RUST_LOG=info ELF_PATH=../emulator/test-vectors/minigeth BLOCK_NO=13284491 SEG_OUTPUT=/tmp/output SEG_SIZE=262144 ARGS="1" \ + cargo run $FEAT $CONF --release --example zkmips split diff --git a/prover/src/fixed_recursive_verifier.rs b/prover/src/fixed_recursive_verifier.rs index 45cf7024..e85d6830 100644 --- a/prover/src/fixed_recursive_verifier.rs +++ b/prover/src/fixed_recursive_verifier.rs @@ -5,6 +5,8 @@ use std::ops::Range; use hashbrown::HashMap; use itertools::{zip_eq, Itertools}; use plonky2::field::extension::Extendable; +#[cfg(feature = "gpu")] +use 
plonky2::fri::oracle::CudaInvContext; use plonky2::fri::FriParams; use plonky2::gates::constant::ConstantGate; use plonky2::gates::noop::NoopGate; @@ -37,6 +39,8 @@ use crate::generation::state::{AssumptionReceipts, AssumptionUsage}; use crate::get_challenges::observe_public_values_target; use crate::proof::{MemRootsTarget, PublicValues, PublicValuesTarget, StarkProofWithMetadata}; use crate::prover::{prove, prove_with_assumptions}; +#[cfg(feature = "gpu")] +use crate::prover::{prove_gpu, prove_with_assumptions_gpu}; use crate::recursive_verifier::{ add_common_recursion_gates, add_virtual_public_values, recursive_stark_circuit, set_public_value_targets, PlonkWrapperCircuit, PublicInputs, StarkWrapperCircuit, @@ -752,6 +756,66 @@ where Ok((root_proof, all_proof.public_values)) } + #[cfg(feature = "gpu")] + pub fn prove_root_gpu( + &self, + all_stark: &AllStark, + kernel: &Kernel, + config: &StarkConfig, + timing: &mut TimingTree, + ctx: &mut CudaInvContext, + ) -> anyhow::Result<(ProofWithPublicInputs, PublicValues)> { + // let all_proof = prove::(all_stark, kernel, config, timing)?; + let all_proof = prove_gpu::(all_stark, kernel, config, timing, ctx)?; + verify_proof(all_stark, all_proof.clone(), config).unwrap(); + let mut root_inputs = PartialWitness::new(); + + for table in 0..NUM_TABLES { + let stark_proof = &all_proof.stark_proofs[table]; + let original_degree_bits = stark_proof.proof.recover_degree_bits(config); + let table_circuits = &self.by_table[table]; + let shrunk_proof = table_circuits + .by_stark_size + .get(&original_degree_bits) + .ok_or_else(|| { + anyhow::Error::msg(format!( + "Missing preprocessed circuits for {:?} table with size {}.", + Table::all()[table], + original_degree_bits, + )) + })? + .shrink(stark_proof, &all_proof.ctl_challenges)?; + let index_verifier_data = table_circuits + .by_stark_size + .keys() + .position(|&size| size == original_degree_bits) + .unwrap(); + root_inputs.set_target( + self.root.index_verifier_data[table], + F::from_canonical_usize(index_verifier_data), + ); + root_inputs.set_proof_with_pis_target(&self.root.proof_with_pis[table], &shrunk_proof); + } + + root_inputs.set_verifier_data_target( + &self.root.cyclic_vk, + &self.aggregation.circuit.verifier_only, + ); + + set_public_value_targets( + &mut root_inputs, + &self.root.public_values, + &all_proof.public_values, + ) + .map_err(|_| { + anyhow::Error::msg("Invalid conversion when setting public values targets.") + })?; + + let root_proof = self.root.circuit.prove(root_inputs)?; + + Ok((root_proof, all_proof.public_values)) + } + pub fn prove_root_with_assumption( &self, all_stark: &AllStark, @@ -815,6 +879,77 @@ where Ok((root_proof, all_proof.public_values, receipts)) } + #[cfg(feature = "gpu")] + pub fn prove_root_with_assumption_gpu( + &self, + all_stark: &AllStark, + kernel: &Kernel, + config: &StarkConfig, + timing: &mut TimingTree, + assumptions: AssumptionReceipts, + ctx: &mut CudaInvContext, + ) -> anyhow::Result<( + ProofWithPublicInputs, + PublicValues, + Rc>>, + )> { + let (all_proof, receipts) = prove_with_assumptions_gpu::( + all_stark, + kernel, + config, + timing, + assumptions, + ctx, + )?; + verify_proof(all_stark, all_proof.clone(), config).unwrap(); + let mut root_inputs = PartialWitness::new(); + + for table in 0..NUM_TABLES { + let stark_proof = &all_proof.stark_proofs[table]; + let original_degree_bits = stark_proof.proof.recover_degree_bits(config); + let table_circuits = &self.by_table[table]; + let shrunk_proof = table_circuits + .by_stark_size + 
.get(&original_degree_bits) + .ok_or_else(|| { + anyhow::Error::msg(format!( + "Missing preprocessed circuits for {:?} table with size {}.", + Table::all()[table], + original_degree_bits, + )) + })? + .shrink(stark_proof, &all_proof.ctl_challenges)?; + let index_verifier_data = table_circuits + .by_stark_size + .keys() + .position(|&size| size == original_degree_bits) + .unwrap(); + root_inputs.set_target( + self.root.index_verifier_data[table], + F::from_canonical_usize(index_verifier_data), + ); + root_inputs.set_proof_with_pis_target(&self.root.proof_with_pis[table], &shrunk_proof); + } + + root_inputs.set_verifier_data_target( + &self.root.cyclic_vk, + &self.aggregation.circuit.verifier_only, + ); + + set_public_value_targets( + &mut root_inputs, + &self.root.public_values, + &all_proof.public_values, + ) + .map_err(|_| { + anyhow::Error::msg("Invalid conversion when setting public values targets.") + })?; + + let root_proof = self.root.circuit.prove(root_inputs)?; + + Ok((root_proof, all_proof.public_values, receipts)) + } + pub fn verify_root(&self, agg_proof: ProofWithPublicInputs) -> anyhow::Result<()> { self.root.circuit.verify(agg_proof) } diff --git a/prover/src/prover.rs b/prover/src/prover.rs index 837a3f85..651cfdd9 100644 --- a/prover/src/prover.rs +++ b/prover/src/prover.rs @@ -40,6 +40,25 @@ use std::{cell::RefCell, rc::Rc}; #[cfg(any(feature = "test", test))] use crate::cross_table_lookup::testutils::check_ctls; +#[cfg(feature = "gpu")] +use plonky2::fri::oracle::CudaInvContext; + +#[cfg(feature = "gpu")] +pub fn prove_gpu( + all_stark: &AllStark, + kernel: &Kernel, + config: &StarkConfig, + timing: &mut TimingTree, + ctx: &mut CudaInvContext, +) -> Result> +where + F: RichField + Extendable, + C: GenericConfig, +{ + let (proof, _outputs) = prove_with_outputs_gpu(all_stark, kernel, config, timing, ctx)?; + Ok(proof) +} + /// Generate traces, then create all STARK proofs. pub fn prove( all_stark: &AllStark, @@ -71,6 +90,46 @@ where Ok((proof, receipts)) } +#[cfg(feature = "gpu")] +pub fn prove_with_assumptions_gpu( + all_stark: &AllStark, + kernel: &Kernel, + config: &StarkConfig, + timing: &mut TimingTree, + assumptions: AssumptionReceipts, + ctx: &mut CudaInvContext, +) -> Result<(AllProof, Rc>>)> +where + F: RichField + Extendable, + C: GenericConfig, +{ + let (proof, _outputs, receipts) = + prove_with_output_and_assumptions_gpu(all_stark, kernel, config, timing, assumptions, ctx)?; + Ok((proof, receipts)) +} + +#[cfg(feature = "gpu")] +pub fn prove_with_outputs_gpu( + all_stark: &AllStark, + kernel: &Kernel, + config: &StarkConfig, + timing: &mut TimingTree, + ctx: &mut CudaInvContext, +) -> Result<(AllProof, GenerationOutputs)> +where + F: RichField + Extendable, + C: GenericConfig, +{ + let (traces, public_values, outputs) = timed!( + timing, + "generate all traces", + generate_traces::(all_stark, kernel, config, timing)? + ); + + let proof = prove_with_traces_gpu(all_stark, config, traces, public_values, timing, ctx)?; + Ok((proof, outputs)) +} + /// Generate traces, then create all STARK proofs. Returns information about the post-state, /// intended for debugging, in addition to the proof. pub fn prove_with_outputs( @@ -126,6 +185,183 @@ where Ok((proof, outputs, receipts)) } +/// Generate traces, then create all STARK proofs. Returns information about the post-state, +/// intended for debugging, in addition to the proof. 
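+/// The GPU variant additionally takes the assumption receipts and a `CudaInvContext`; the trace
+/// and quotient commitments and the FRI opening proof are then computed on the device.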
+#[cfg(feature = "gpu")] +pub fn prove_with_output_and_assumptions_gpu( + all_stark: &AllStark, + kernel: &Kernel, + config: &StarkConfig, + timing: &mut TimingTree, + assumptions: AssumptionReceipts, + ctx: &mut CudaInvContext, +) -> Result<( + AllProof, + GenerationOutputs, + Rc>>, +)> +where + F: RichField + Extendable, + C: GenericConfig, +{ + let (traces, public_values, outputs, receipts) = timed!( + timing, + "generate all traces", + generate_traces_with_assumptions::( + all_stark, + kernel, + config, + timing, + assumptions + )? + ); + + let proof = prove_with_traces_gpu(all_stark, config, traces, public_values, timing, ctx)?; + Ok((proof, outputs, receipts)) +} + +#[cfg(feature = "gpu")] +pub(crate) fn prove_with_traces_gpu( + all_stark: &AllStark, + config: &StarkConfig, + trace_poly_values: [Vec>; NUM_TABLES], + public_values: PublicValues, + timing: &mut TimingTree, + ctx: &mut CudaInvContext, +) -> Result> +where + F: RichField + Extendable, + C: GenericConfig, +{ + let rate_bits = config.fri_config.rate_bits; + let cap_height = config.fri_config.cap_height; + + // println!("rate_bits: {}, cap_height: {}", rate_bits, cap_height); + let trace_commitments = timed!( + timing, + "compute all trace commitments", + trace_poly_values + .iter() + .zip_eq(Table::all()) + .map(|(trace, table)| { + timed!( + timing, + &format!("compute trace commitment for {:?}", table), + { + // println!( + // "trace len: {}, values len: {}", + // trace.len(), + // trace[0].values.len() + // ); + let ret = { + // let trace_flatten = &trace + // .par_iter() + // .flat_map(|p| p.values.to_vec()) + // .collect::>(); + + PolynomialBatch::from_values_with_gpu( + trace, + trace.len(), + trace[0].values.len(), + rate_bits, + false, + cap_height, + timing, + ctx, + table as u32, + ) + }; + + // let ret = PolynomialBatch::::from_values( + // // TODO: Cloning this isn't great; consider having `from_values` accept a reference, + // // or having `compute_permutation_z_polys` read trace values from the `PolynomialBatch`. + // trace.clone(), + // rate_bits, + // false, + // cap_height, + // timing, + // None, + // ); + + ret + } + ) + }) + .collect::>() + ); + + log::debug!("trace_commitments: {}", trace_commitments.len()); + + // exit(0); + #[cfg(any(feature = "test", test))] + { + log::debug!("check_ctls..."); + check_ctls( + &trace_poly_values, + &all_stark.cross_table_lookups, + // &get_memory_extra_looking_values(&public_values), + ); + log::debug!("check_ctls done."); + } + + let trace_caps = trace_commitments + .iter() + .map(|c| c.merkle_tree.cap.clone()) + .collect::>(); + let mut challenger = Challenger::::new(); + for cap in &trace_caps { + challenger.observe_cap(cap); + } + + observe_public_values::(&mut challenger, &public_values) + .map_err(|_| anyhow::Error::msg("Invalid conversion of public values."))?; + + let ctl_challenges = get_grand_product_challenge_set(&mut challenger, config.num_challenges); + let ctl_data_per_table = timed!( + timing, + "compute CTL data", + cross_table_lookup_data::( + &trace_poly_values, + &all_stark.cross_table_lookups, + &ctl_challenges, + all_stark.arithmetic_stark.constraint_degree() + ) + ); + + let stark_proofs = timed!( + timing, + "compute all proofs given commitments", + prove_with_commitments_gpu( + all_stark, + config, + trace_poly_values, + trace_commitments, + ctl_data_per_table, + &mut challenger, + &ctl_challenges, + timing, + ctx, + )? 
+ ); + + /* + #[cfg(test)] + { + check_ctls( + &trace_poly_values, + &all_stark.cross_table_lookups, + &get_memory_extra_looking_values(&public_values), + ); + } + */ + + Ok(AllProof { + stark_proofs, + ctl_challenges, + public_values, + }) +} + /// Compute all STARK proofs. pub(crate) fn prove_with_traces( all_stark: &AllStark, @@ -346,6 +582,130 @@ where ]) } +#[cfg(feature = "gpu")] +fn prove_with_commitments_gpu( + all_stark: &AllStark, + config: &StarkConfig, + trace_poly_values: [Vec>; NUM_TABLES], + trace_commitments: Vec>, + ctl_data_per_table: [CtlData; NUM_TABLES], + challenger: &mut Challenger, + ctl_challenges: &GrandProductChallengeSet, + timing: &mut TimingTree, + ctx: &mut CudaInvContext, +) -> Result<[StarkProofWithMetadata; NUM_TABLES]> +where + F: RichField + Extendable, + C: GenericConfig, +{ + let arithmetic_proof = timed!( + timing, + "prove Arithmetic STARK", + prove_single_table_gpu( + &all_stark.arithmetic_stark, + config, + &trace_poly_values[Table::Arithmetic as usize], + &trace_commitments[Table::Arithmetic as usize], + &ctl_data_per_table[Table::Arithmetic as usize], + ctl_challenges, + challenger, + timing, + ctx, + Table::Arithmetic, + )? + ); + let cpu_proof = timed!( + timing, + "prove CPU STARK", + prove_single_table_gpu( + &all_stark.cpu_stark, + config, + &trace_poly_values[Table::Cpu as usize], + &trace_commitments[Table::Cpu as usize], + &ctl_data_per_table[Table::Cpu as usize], + ctl_challenges, + challenger, + timing, + ctx, + Table::Cpu, + )? + ); + + let poseidon_proof = timed!( + timing, + "prove Poseidon STARK", + prove_single_table_gpu( + &all_stark.poseidon_stark, + config, + &trace_poly_values[Table::Poseidon as usize], + &trace_commitments[Table::Poseidon as usize], + &ctl_data_per_table[Table::Poseidon as usize], + ctl_challenges, + challenger, + timing, + ctx, + Table::Poseidon + )? + ); + let poseidon_sponge_proof = timed!( + timing, + "prove Poseidon sponge STARK", + prove_single_table_gpu( + &all_stark.poseidon_sponge_stark, + config, + &trace_poly_values[Table::PoseidonSponge as usize], + &trace_commitments[Table::PoseidonSponge as usize], + &ctl_data_per_table[Table::PoseidonSponge as usize], + ctl_challenges, + challenger, + timing, + ctx, + Table::PoseidonSponge, + )? + ); + let logic_proof = timed!( + timing, + "prove logic STARK", + prove_single_table_gpu( + &all_stark.logic_stark, + config, + &trace_poly_values[Table::Logic as usize], + &trace_commitments[Table::Logic as usize], + &ctl_data_per_table[Table::Logic as usize], + ctl_challenges, + challenger, + timing, + ctx, + Table::Logic + )? + ); + let memory_proof = timed!( + timing, + "prove memory STARK", + prove_single_table_gpu( + &all_stark.memory_stark, + config, + &trace_poly_values[Table::Memory as usize], + &trace_commitments[Table::Memory as usize], + &ctl_data_per_table[Table::Memory as usize], + ctl_challenges, + challenger, + timing, + ctx, + Table::Memory + )? + ); + + Ok([ + arithmetic_proof, + cpu_proof, + poseidon_proof, + poseidon_sponge_proof, + logic_proof, + memory_proof, + ]) +} + /// Compute proof for a single STARK table. 
pub(crate) fn prove_single_table( stark: &S, @@ -549,6 +909,284 @@ where }) } +#[cfg(feature = "gpu")] +pub(crate) fn prove_single_table_gpu( + stark: &S, + config: &StarkConfig, + trace_poly_values: &[PolynomialValues], + trace_commitment: &PolynomialBatch, + ctl_data: &CtlData, + ctl_challenges: &GrandProductChallengeSet, + challenger: &mut Challenger, + timing: &mut TimingTree, + ctx: &mut CudaInvContext, + table: Table, +) -> Result> +where + F: RichField + Extendable, + C: GenericConfig, + S: Stark, +{ + let degree = trace_poly_values[0].len(); + let degree_bits = log2_strict(degree); + let fri_params = config.fri_params(degree_bits); + let rate_bits = config.fri_config.rate_bits; + let cap_height = config.fri_config.cap_height; + assert!( + fri_params.total_arities() <= degree_bits + rate_bits - cap_height, + "FRI total reduction arity is too large.", + ); + + let init_challenger_state = challenger.compact(); + + let constraint_degree = stark.constraint_degree(); + let lookup_challenges = timed!( + timing, + "lookup_challenges", + stark.uses_lookups().then(|| { + ctl_challenges + .challenges + .iter() + .map(|ch| ch.beta) + .collect::>() + }) + ); + + let lookups = stark.lookups(); + let lookup_helper_columns = timed!( + timing, + "compute lookup helper columns", + lookup_challenges.as_ref().map(|challenges| { + let mut columns = Vec::new(); + for lookup in &lookups { + for &challenge in challenges { + columns.extend(lookup_helper_columns( + lookup, + trace_poly_values, + challenge, + constraint_degree, + )); + } + } + columns + }) + ); + let num_lookup_columns = lookup_helper_columns.as_ref().map(|v| v.len()).unwrap_or(0); + + let auxiliary_polys = timed!( + timing, + "get auxiliary_polys", + match lookup_helper_columns { + None => { + let mut ctl_polys = ctl_data.ctl_helper_polys(); + ctl_polys.extend(ctl_data.ctl_z_polys()); + ctl_polys + } + Some(mut lookup_columns) => { + lookup_columns.extend(ctl_data.ctl_helper_polys()); + lookup_columns.extend(ctl_data.ctl_z_polys()); + lookup_columns + } + } + ); + assert!(!auxiliary_polys.is_empty(), "No CTL?"); + + // println!( + // "aux, table: {:?} len: {}, values len: {}", + // table, + // auxiliary_polys.len(), + // auxiliary_polys[0].values.len() + // ); + + let auxiliary_polys_commitment = timed!(timing, "compute auxiliary polynomials commitment", { + let poly = if auxiliary_polys.len() > 4 + // if false + { + // let auxiliary_polys_flatten = &auxiliary_polys + // .iter() + // .flat_map(|p| p.values.to_vec()) + // .collect::>(); + PolynomialBatch::from_values_with_gpu( + &auxiliary_polys, + auxiliary_polys.len(), + auxiliary_polys[0].values.len(), + rate_bits, + false, + cap_height, + timing, + ctx, + table as u32 + 6, + ) + } else { + PolynomialBatch::from_values( + auxiliary_polys, + rate_bits, + false, + config.fri_config.cap_height, + timing, + None, + ) + }; + poly + }); + let auxiliary_polys_cap = auxiliary_polys_commitment.merkle_tree.cap.clone(); + timed!( + timing, + "observe aux cap", + challenger.observe_cap(&auxiliary_polys_cap) + ); + + let alphas = timed!( + timing, + "observe aux cap", + challenger.get_n_challenges(config.num_challenges) + ); + + let num_ctl_polys = timed!( + timing, + "get num_ctl_helper_polys", + ctl_data.num_ctl_helper_polys() + ); + if cfg!(test) { + timed!( + timing, + "check_constraints", + check_constraints( + stark, + trace_commitment, + &auxiliary_polys_commitment, + lookup_challenges.as_ref(), + &lookups, + ctl_data, + alphas.clone(), + degree_bits, + num_lookup_columns, + &num_ctl_polys, + ) 
+ ); + } + let quotient_polys = timed!( + timing, + "compute quotient polys", + compute_quotient_polys::::Packing, C, S, D>( + stark, + trace_commitment, + &auxiliary_polys_commitment, + lookup_challenges.as_ref(), + &lookups, + ctl_data, + alphas, + degree_bits, + num_lookup_columns, + &num_ctl_polys, + config, + ) + ); + let all_quotient_chunks: Vec> = timed!( + timing, + "split quotient polys", + quotient_polys + .into_par_iter() + .flat_map(|mut quotient_poly| { + quotient_poly + .trim_to_len(degree * stark.quotient_degree_factor()) + .expect( + "Quotient has failed, the vanishing polynomial is not divisible by Z_H", + ); + // Split quotient into degree-n chunks. + quotient_poly.chunks(degree) + }) + .collect() + ); + + // println!( + // "quotient, table: {:?}, len: {}, values len: {}", + // table, + // all_quotient_chunks.len(), + // all_quotient_chunks[0].len() + // ); + let quotient_commitment = timed!(timing, "compute quotient commitment", { + // if all_quotient_chunks[0].len() == 1048576 + // let all_quotient_chunks_flatten = &all_quotient_chunks.iter().flat_map(|p|p.coeffs.to_vec()).collect::>(); + let values_num_per_poly = all_quotient_chunks[0].len(); + let poly_num = all_quotient_chunks.len(); + + PolynomialBatch::from_coeffs_with_gpu( + all_quotient_chunks.clone(), + values_num_per_poly, + poly_num, + rate_bits, + false, + config.fri_config.cap_height, + timing, + ctx, + table as u32 + 12, + ) + }); + let quotient_polys_cap = quotient_commitment.merkle_tree.cap.clone(); + challenger.observe_cap("ient_polys_cap); + + let zeta = challenger.get_extension_challenge::(); + // To avoid leaking witness data, we want to ensure that our opening locations, `zeta` and + // `g * zeta`, are not in our subgroup `H`. It suffices to check `zeta` only, since + // `(g * zeta)^n = zeta^n`, where `n` is the order of `g`. + let g = F::primitive_root_of_unity(degree_bits); + ensure!( + zeta.exp_power_of_2(degree_bits) != F::Extension::ONE, + "Opening point is in the subgroup." + ); + + let openings = timed!( + timing, + "new stark opening set", + StarkOpeningSet::new( + zeta, + g, + trace_commitment, + &auxiliary_polys_commitment, + "ient_commitment, + stark.num_lookup_helper_columns(config), + &num_ctl_polys, + ) + ); + timed!( + timing, + "observe_openings", + challenger.observe_openings(&openings.to_fri_openings()) + ); + + let initial_merkle_trees = vec![ + trace_commitment, + &auxiliary_polys_commitment, + "ient_commitment, + ]; + + let opening_proof = timed!( + timing, + "compute openings proof", + PolynomialBatch::prove_openings_gpu( + &stark.fri_instance(zeta, g, num_ctl_polys.iter().sum(), num_ctl_polys, config), + &initial_merkle_trees, + challenger, + &fri_params, + timing, + ctx, + ) + ); + + let proof = StarkProof { + trace_cap: trace_commitment.merkle_tree.cap.clone(), + auxiliary_polys_cap, + quotient_polys_cap, + openings, + opening_proof, + }; + Ok(StarkProofWithMetadata { + init_challenger_state, + proof, + }) +} + /// Computes the quotient polynomials `(sum alpha^i C_i(x)) / Z_H(x)` for `alpha` in `alphas`, /// where the `C_i`s are the Stark constraints. fn compute_quotient_polys<'a, F, P, C, S, const D: usize>(
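Running the GPU path requires a nightly toolchain (the example enables #![feature(allocator_api)]) and the config.toml patch above, which redirects the plonky2 crates to zkMIPS/plonky2-gpu. As the p.sh/s.sh helpers suggest, a segment proof can be run roughly like this (paths and environment values are illustrative):

BASEDIR=../emulator/test-vectors RUST_LOG=info BLOCK_NO=13284491 \
SEG_FILE_DIR=/tmp/output SEG_START_ID=0 SEG_NUM=1 SEG_SIZE=262144 \
  cargo run --features gpu --config ../config.toml --release --example zkmips prove_segments

Omitting --features gpu (and the --config override) falls back to the existing CPU prover path.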