elastic
diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/_nightly/esql/QueryPlanningBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/_nightly/esql/QueryPlanningBenchmark.java
Lines changed: 2 additions & 1 deletion
diff --git a/docs/changelog/139074.yaml b/docs/changelog/139074.yaml
Lines changed: 5 additions & 0 deletions
diff --git a/docs/reference/query-languages/esql/_snippets/commands/layout/completion.md b/docs/reference/query-languages/esql/_snippets/commands/layout/completion.md
Lines changed: 33 additions & 1 deletion
diff --git a/docs/reference/query-languages/esql/_snippets/commands/layout/rerank.md b/docs/reference/query-languages/esql/_snippets/commands/layout/rerank.md
Lines changed: 47 additions & 0 deletions
diff --git a/x-pack/plugin/esql/build.gradle b/x-pack/plugin/esql/build.gradle
Lines changed: 4 additions & 0 deletions
diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/EsqlTestUtils.java b/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/EsqlTestUtils.java
Lines changed: 9 additions & 1 deletion
diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/completion.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/completion.csv-spec
Lines changed: 14 additions & 0 deletions
diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/rerank.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/rerank.csv-spec
Lines changed: 15 additions & 0 deletions
@@ -27,6 +27,7 @@
 import org.elasticsearch.xpack.esql.index.EsIndex;
 import org.elasticsearch.xpack.esql.index.IndexResolution;
 import org.elasticsearch.xpack.esql.inference.InferenceResolution;
+import org.elasticsearch.xpack.esql.inference.InferenceSettings;
 import org.elasticsearch.xpack.esql.optimizer.LogicalOptimizerContext;
 import org.elasticsearch.xpack.esql.optimizer.LogicalPlanOptimizer;
 import org.elasticsearch.xpack.esql.parser.EsqlParser;
@@ -126,7 +127,7 @@ public void setup() {
     }
 
     private LogicalPlan plan(EsqlParser parser, Analyzer analyzer, LogicalPlanOptimizer optimizer, String query) {
-        var parsed = parser.parseQuery(query, new QueryParams(), telemetry);
+        var parsed = parser.parseQuery(query, new QueryParams(), telemetry, new InferenceSettings(Settings.EMPTY));
         var analyzed = analyzer.analyze(parsed);
         var optimized = optimizer.optimize(analyzed);
         return optimized;
 
@@ -0,0 +1,5 @@
+pr: 139074
+summary: "[ESQL][Inference] Introduce usage limits for COMPLETION and RERANK"
+area: ES|QL
+type: enhancement
+issues: []
@@ -6,9 +6,38 @@ stack: preview 9.1.0
 
 The `COMPLETION` command allows you to send prompts and context to a Large Language Model (LLM) directly within your ES|QL queries, to perform text generation tasks.
 
-:::{important}
+:::::{important}
 **Every row processed by the COMPLETION command generates a separate API call to the LLM endpoint.**
 
+::::{tab-set}
+
+:::{tab-item} 9.3.0+
+
+Starting in version 9.3.0, `COMPLETION` automatically limits processing to **100 rows by default** to prevent accidentally high consumption and costs. This limit is applied before the `COMPLETION` command executes.
+
+If you need to process more rows, you can raise the limit with the `esql.command.completion.limit` cluster setting:
+```console
+PUT _cluster/settings
+{
+  "persistent": {
+    "esql.command.completion.limit": 500
+  }
+}
+```
+
+You can also disable the command entirely with the `esql.command.completion.enabled` cluster setting:
+```console
+PUT _cluster/settings
+{
+  "persistent": {
+    "esql.command.completion.enabled": false
+  }
+}
+```
+:::
+
+:::{tab-item} 9.1.x - 9.2.x
+
 Be careful to test with small datasets first before running on production data or in automated workflows, to avoid unexpected costs.
 
 Best practices:
@@ -19,6 +48,9 @@ Best practices:
 4. **Monitor usage**: Track your LLM API consumption and costs.
 :::
 
+::::
+:::::
+
 **Syntax**
 
 ::::{tab-set}
 
@@ -7,6 +7,53 @@ stack: preview 9.2.0
 The `RERANK` command uses an inference model to compute a new relevance score
 for an initial set of documents, directly within your ES|QL queries.
 
+:::::{important}
+**RERANK processes each row through an inference model, which impacts performance and costs.**
+
+::::{tab-set}
+
+:::{tab-item} 9.3.0+
+
+Starting in version 9.3.0, `RERANK` automatically limits processing to **1000 rows by default** to prevent accidentally high consumption. This limit is applied before the `RERANK` command executes.
+
+If you need to process more rows, you can raise the limit with the `esql.command.rerank.limit` cluster setting:
+```console
+PUT _cluster/settings
+{
+  "persistent": {
+    "esql.command.rerank.limit": 5000
+  }
+}
+```
+
+You can also disable the command entirely with the `esql.command.rerank.enabled` cluster setting:
+```console
+PUT _cluster/settings
+{
+  "persistent": {
+    "esql.command.rerank.enabled": false
+  }
+}
+```
+:::
+
+:::{tab-item} 9.2.x
+
+No automatic row limit is applied. **You should always use `LIMIT` before or after `RERANK` to control the number of documents processed**, to avoid accidentally reranking large datasets which can result in high latency and increased costs.
+
+For example:
+```esql
+FROM books
+| WHERE title:"search query"
+| SORT _score DESC
+| LIMIT 100  // Limit to top 100 results before reranking
+| RERANK "search query" ON title WITH { "inference_id" : "my_rerank_endpoint" }
+```
+:::
+
+::::
+:::::
+
 **Syntax**
 
 ```esql
 
@@ -55,8 +55,10 @@ dependencies {
   }
   testImplementation project(':test:framework')
   testImplementation(testArtifact(project(xpackModule('core'))))
+  testImplementation(testArtifact(project(xpackModule('inference'))))
   testImplementation project(path: xpackModule('enrich'))
   testImplementation project(path: xpackModule('spatial'))
+  testImplementation project(path: xpackModule('inference'))
   testImplementation project(path: xpackModule('kql'))
   testImplementation project(path: xpackModule('mapper-unsigned-long'))
 
@@ -72,6 +74,8 @@ dependencies {
   testImplementation('org.webjars.npm:fontsource__roboto-mono:4.5.7')
 
   internalClusterTestImplementation project(":modules:mapper-extras")
+  internalClusterTestImplementation project(xpackModule('inference:qa:test-service-plugin'))
+  internalClusterTestImplementation(testArtifact(project(xpackModule('inference'))))
 }
 
 tasks.named("dependencyLicenses").configure {
 
@@ -27,6 +27,7 @@
 import org.elasticsearch.common.collect.Iterators;
 import org.elasticsearch.common.lucene.BytesRefs;
 import org.elasticsearch.common.regex.Regex;
+import org.elasticsearch.common.settings.ClusterSettings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.ByteSizeValue;
 import org.elasticsearch.common.util.BigArrays;
@@ -112,6 +113,7 @@
 import org.elasticsearch.xpack.esql.index.IndexResolution;
 import org.elasticsearch.xpack.esql.inference.InferenceResolution;
 import org.elasticsearch.xpack.esql.inference.InferenceService;
+import org.elasticsearch.xpack.esql.inference.InferenceSettings;
 import org.elasticsearch.xpack.esql.optimizer.LogicalOptimizerContext;
 import org.elasticsearch.xpack.esql.parser.EsqlParser;
 import org.elasticsearch.xpack.esql.parser.QueryParam;
@@ -568,7 +570,7 @@ public static LogicalOptimizerContext unboundLogicalOptimizerContext() {
         mock(ProjectResolver.class),
         mock(IndexNameExpressionResolver.class),
         null,
-        new InferenceService(mock(Client.class)),
+        new InferenceService(mock(Client.class), createMockClusterService()),
         new BlockFactoryProvider(PlannerUtils.NON_BREAKING_BLOCK_FACTORY),
         TEST_PLANNER_SETTINGS,
         new CrossProjectModeDecider(Settings.EMPTY)
@@ -577,6 +579,12 @@ public static LogicalOptimizerContext unboundLogicalOptimizerContext() {
     private static ClusterService createMockClusterService() {
         var service = mock(ClusterService.class);
         doReturn(new ClusterName("test-cluster")).when(service).getClusterName();
+        doReturn(Settings.EMPTY).when(service).getSettings();
+
+        // Back getClusterSettings() with a real ClusterSettings built from Settings.EMPTY that registers InferenceSettings.getSettings()
+        var clusterSettings = new ClusterSettings(Settings.EMPTY, new java.util.HashSet<>(InferenceSettings.getSettings()));
+        doReturn(clusterSettings).when(service).getClusterSettings();
+
         return service;
     }
 
 
@@ -59,3 +59,17 @@ title:text                      | completion:keyword
 War and Peace                   | THIS IS A PROMPT: WAR AND PEACE
 War and Peace (Signet Classics) | THIS IS A PROMPT: WAR AND PEACE (SIGNET CLASSICS)
 ;                                                                                                                       
+
+completion followed by stats
+required_capability: completion
+required_capability: match_operator_colon
+
+FROM books METADATA _score
+| WHERE title:"war and peace" AND author:"Tolstoy"
+| COMPLETION CONCAT("This is a prompt: ", title) WITH { "inference_id" : "test_completion" }
+| STATS count=COUNT(*), avg_completion_length = AVG(LENGTH(completion))
+;
+
+count:long | avg_completion_length:double
+4          | 50.75
+;
@@ -294,3 +294,18 @@ The Lord of the Rings - Boxed Set                                | 3.76885509490
 Return of the King Being the Third Part of The Lord of the Rings | 3.6248698234558105 | 9.000900317914784E-4 | 0.001396648003719747
 // end::combine-result[]
 ;
+
+reranker followed by stats
+required_capability: rerank
+required_capability: match_operator_colon
+
+FROM books METADATA _score
+| WHERE title:"war and peace" AND author:"Tolstoy"
+| SORT _score DESC, book_no ASC
+| RERANK "war and peace" ON title WITH { "inference_id" : "test_reranker" }
+| STATS count_book = COUNT(*) WHERE _score >= 0.03
+;
+
+count_book:long
+2
+;