Skip to content

Commit 6f08f64

Browse files
authored
Improve search relevance and ensure less janky 'as you type' results (#2304)
* Improve search relevance and ensure less janky 'as you type' results - Improve search relevance with exact title matching and prefix search capabilities - Add title prefix boosting for smoother autocomplete behavior: - Single character queries (e.g., "A") get high boost (100x) to show only titles starting with that letter - Two character queries get medium boost (4x) - Up to 10 characters uses a term query with no additional boost - This creates a less janky autocomplete experience where results don't shift dramatically as you type - Add bidirectional synonym expansion for query-time boosting - Ensures both .NET and dotnet score high when the page title is just .NET - Synonyms are expanded in both directions so searching either term finds the relevant page - Apply keyword normalizer to all keyword fields for consistent case-insensitive matching - Reorganize and expand config/search.yml with new synonyms (edr, eks, kpi, logsdb, rag, tsvb) and diminish terms (curator, hadoop, glossary) - Remove unused url_segment_count field from document mapping Test plan - Verify all search relevance tests pass with updated expected results - Test prefix searches with short queries (1-2 characters) return sensible results with titles starting with those characters - Verify typing progressively longer queries produces stable results without excessive shifting - Verify synonym expansion works bidirectionally (searching "dotnet" finds ".NET" pages and vice versa) - Confirm exact title matches are properly boosted in search results * Code QoL improvements * fix test
1 parent 865311c commit 6f08f64

File tree

18 files changed

+679
-552
lines changed

18 files changed

+679
-552
lines changed

config/search.yml

Lines changed: 54 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,14 @@
1-
synonyms:
2-
- [ ".net", "c#", "csharp", "dotnet", "net" ]
3-
- [ "esql", "es|ql => esql" ]
4-
- [ "data-stream", "data stream", "datastream => data-streams"]
5-
- [ "data-streams", "data streams", "datastreams"]
6-
- [ "motlp", "managed otlp" ]
7-
- [ "s3", "aws s3", "amazon s3" ]
8-
- [ "es", "elasticsearch" ]
9-
- [ "elastic learned sparse encoder", "elser" ]
10-
- [ "ccs", "cross cluster search", "cross-cluster search", "spaghetti" ]
11-
- [ "cps", "cross project search", "cross-project search" ]
12-
- [ "apm", "application performance monitoring" ]
13-
- [ "ecctl", "elastic cloud control" ]
14-
- [ "opentelemetry", "otel" ]
15-
- [ "elastic distributions of opentelemetry", "edot" ]
16-
- [ "eck", "elastic cloud on kubernetes" ]
17-
- [ "ece", "elastic cloud enterprise" ]
18-
- [ "elv2", "elastic license v2" ]
19-
- [ "kql", "kibana query language" ]
20-
- [ "ccr", "cross-cluster replication", "cross cluster replication" ]
21-
- [ "esaas", "elastic stack as a service" ]
22-
- [ "knn", "k-nearest neighbors" ]
23-
- [ "ech", "elastic cloud hosted", "ess" ]
24-
- [ "elasticsearch service", "elastic cloud" ]
25-
- [ "aws", "amazon" ]
26-
- [ "gcp", "google cloud platform" ]
27-
- [ "ilm", "index lifecycle management" ]
28-
- [ "javascript", "js", "node", "nodejs", "node.js" ]
29-
- [ "edot", "elastic distribution of opentelemetry" ]
30-
- [ "k8s", "kubernetes" ]
31-
- [ "ecs", "elastic common schema" ]
32-
- [ "machine-learning", "machine learning", "ml => machine learning" ]
33-
- [ "eis", "elastic inference service" ]
34-
- [ "traffic filter", "network security" ]
35-
- [ "sso", "single sign-on" ]
36-
- [ "querydsl", "query dsl", "query dsl"]
1+
# Dampen the impact of search results that contain these terms.
2+
# Use sparingly; in general, our query relevance should be good enough.
3+
diminish_terms:
4+
- plugin
5+
- client
6+
- integration
7+
- curator
8+
- hadoop
9+
- glossary
3710

11+
# Query rules, to promote certain results over others.
3812
# Use sparingly; in general, our query relevance should be good enough.
3913
rules:
4014
# datastreams require this special handling because `datastream` is really unique vs the other spellings.
@@ -65,8 +39,47 @@ rules:
6539
ids:
6640
- /docs/reference/logstash
6741

68-
diminish_terms:
69-
- plugin
70-
- client
71-
- integration
72-
- glossary
42+
43+
synonyms:
44+
- [ ".net", "c#", "csharp", "dotnet", "net" ]
45+
- [ "apm", "application performance monitoring" ]
46+
- [ "aws", "amazon" ]
47+
- [ "ccr", "cross-cluster replication", "cross cluster replication" ]
48+
- [ "ccs", "cross cluster search", "cross-cluster search", "spaghetti" ]
49+
- [ "cps", "cross project search", "cross-project search" ]
50+
- [ "data-stream", "data stream", "datastream => data-streams"]
51+
- [ "data-streams", "data streams", "datastreams"]
52+
- [ "ecctl", "elastic cloud control" ]
53+
- [ "ece", "elastic cloud enterprise" ]
54+
- [ "ech", "elastic cloud hosted", "ess" ]
55+
- [ "eck", "elastic cloud on kubernetes" ]
56+
- [ "ecs", "elastic common schema" ]
57+
- [ "edot", "elastic distribution of opentelemetry" ]
58+
- [ "edr", "endpoint detection response" ]
59+
- [ "eis", "elastic inference service" ]
60+
- [ "eks", "elastic kubernetes service" ]
61+
- [ "elastic distributions of opentelemetry", "edot" ]
62+
- [ "elastic learned sparse encoder", "elser" ]
63+
- [ "elasticsearch service", "elastic cloud" ]
64+
- [ "elv2", "elastic license v2" ]
65+
- [ "es", "elasticsearch" ]
66+
- [ "esaas", "elastic stack as a service" ]
67+
- [ "esql", "es|ql => esql" ]
68+
- [ "gcp", "google cloud platform" ]
69+
- [ "ilm", "index lifecycle management" ]
70+
- [ "javascript", "js", "node", "nodejs", "node.js" ]
71+
- [ "k8s", "kubernetes" ]
72+
- [ "knn", "k-nearest neighbors" ]
73+
- [ "kpi", "key performance indicator" ]
74+
- [ "kql", "kibana query language" ]
75+
- [ "logsdb", "logs datastream", "logs data stream" ]
76+
- [ "machine-learning", "machine learning", "ml => machine learning" ]
77+
- [ "motlp", "managed otlp" ]
78+
- [ "opentelemetry", "otel" ]
79+
- [ "querydsl", "query dsl", "query dsl"]
80+
- [ "rag", "retrieval augmented generation" ]
81+
- [ "s3", "aws s3", "amazon s3" ]
82+
- [ "sso", "single sign-on" ]
83+
- [ "traffic filter", "network security" ]
84+
- [ "tsvb", "time series visual builder" ]
85+

src/Elastic.ApiExplorer/Elasticsearch/OpenApiDocumentExporter.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ private IEnumerable<DocumentationDocument> ConvertToDocuments(OpenApiDocument op
165165
Type = "api",
166166
Url = url,
167167
Title = title,
168+
SearchTitle = title,
168169
Description = description,
169170
Body = body,
170171
StrippedBody = body,

src/Elastic.Documentation.Configuration/Search/SearchConfiguration.cs

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,46 @@
33
// See the LICENSE file in the project root for more information
44

55
using System.Collections.Immutable;
6+
using System.Diagnostics.CodeAnalysis;
67

78
namespace Elastic.Documentation.Configuration.Search;
89

910
public record SearchConfiguration
1011
{
11-
public required IReadOnlyCollection<string> Synonyms { get; init; }
12+
private readonly IReadOnlyDictionary<string, string[]> _synonyms;
13+
14+
public required IReadOnlyDictionary<string, string[]> Synonyms
15+
{
16+
get => _synonyms;
17+
[MemberNotNull(nameof(_synonyms))]
18+
init
19+
{
20+
_synonyms = value;
21+
SynonymBiDirectional = value
22+
.Select(kv => kv.Value.Concat([kv.Key]).ToArray())
23+
.SelectMany(a =>
24+
{
25+
var targets = new List<string[]>();
26+
foreach (var s in a)
27+
{
28+
if (s.Contains(' ') || s.Contains("=>"))
29+
continue;
30+
31+
List<string> newTarget = [s];
32+
newTarget.AddRange(a.Except([s]));
33+
targets.Add(newTarget.ToArray());
34+
}
35+
36+
return targets;
37+
})
38+
.Where(a => a.Length > 1)
39+
.DistinctBy(a => a[0])
40+
.ToDictionary(a => a[0], a => a.Skip(1).ToArray(), StringComparer.OrdinalIgnoreCase);
41+
}
42+
}
43+
44+
public IReadOnlyDictionary<string, string[]> SynonymBiDirectional { get; private set; } = new Dictionary<string, string[]>();
45+
1246
public required IReadOnlyCollection<QueryRule> Rules { get; init; }
1347
public required IReadOnlyCollection<string> DiminishTerms { get; init; }
1448
}
@@ -78,15 +112,18 @@ public static class SearchConfigurationExtensions
78112
public static SearchConfiguration CreateSearchConfiguration(this ConfigurationFileProvider provider)
79113
{
80114
var searchFile = provider.SearchFile;
115+
var synonyms = new Dictionary<string, string[]>();
81116

82117
if (!searchFile.Exists)
83-
return new SearchConfiguration { Synonyms = [], Rules = [], DiminishTerms = [] };
118+
return new SearchConfiguration { Synonyms = synonyms, Rules = [], DiminishTerms = [] };
84119

85120
var searchDto = ConfigurationFileProvider.Deserializer.Deserialize<SearchConfigDto>(searchFile.OpenText());
86-
var flattenedSynonyms = searchDto.Synonyms.Select(sl => string.Join(',', sl)).ToImmutableArray();
121+
synonyms = searchDto.Synonyms
122+
.Where(s => s.Count > 1)
123+
.ToDictionary(k => k[0], sl => sl.Skip(1).ToArray(), StringComparer.OrdinalIgnoreCase);
87124
var rules = searchDto.Rules.Select(ParseRule).ToImmutableArray();
88125
var diminishTerms = searchDto.DiminishTerms.ToImmutableArray();
89-
return new SearchConfiguration { Synonyms = flattenedSynonyms, Rules = rules, DiminishTerms = diminishTerms };
126+
return new SearchConfiguration { Synonyms = synonyms, Rules = rules, DiminishTerms = diminishTerms };
90127
}
91128

92129
private static QueryRule ParseRule(QueryRuleDto dto) =>

src/Elastic.Documentation/Search/DocumentationDocument.cs

Lines changed: 18 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -10,20 +10,32 @@ namespace Elastic.Documentation.Search;
1010
public record ParentDocument
1111
{
1212
[JsonPropertyName("title")]
13-
public string? Title { get; set; }
13+
public required string Title { get; set; }
1414

1515
[JsonPropertyName("url")]
16-
public string? Url { get; set; }
16+
public required string Url { get; set; }
1717
}
1818

1919
public record DocumentationDocument
2020
{
21+
[JsonPropertyName("title")]
22+
public required string Title { get; set; }
23+
24+
/// <summary>
25+
/// Search title is a combination of the title and the url components.
26+
/// This is used for querying to not reward documents with short titles contributing too heavily to scoring
27+
/// </summary>
28+
[JsonPropertyName("search_title")]
29+
public required string SearchTitle { get; set; }
30+
2131
[JsonPropertyName("type")]
22-
public string Type { get; set; } = "doc";
32+
public required string Type { get; set; } = "doc";
2333

24-
// TODO make this required once all doc_sets have published again
2534
[JsonPropertyName("url")]
26-
public string Url { get; set; } = string.Empty;
35+
public required string Url { get; set; } = string.Empty;
36+
37+
[JsonPropertyName("hash")]
38+
public string Hash { get; set; } = string.Empty;
2739

2840
[JsonPropertyName("navigation_depth")]
2941
public int NavigationDepth { get; set; } = 50; //default to a high number so that omission gets penalized.
@@ -43,20 +55,6 @@ public record DocumentationDocument
4355
[JsonPropertyName("last_updated")]
4456
public DateTimeOffset LastUpdated { get; set; }
4557

46-
// TODO make this required once all doc_sets have published again
47-
[JsonPropertyName("hash")]
48-
public string Hash { get; set; } = string.Empty;
49-
50-
/// <summary>
51-
/// Search title is a combination of the title and the url components.
52-
/// This is used for querying to not reward documents with short titles contributing to heavily to scoring
53-
/// </summary>
54-
[JsonPropertyName("search_title")]
55-
public string? SearchTitle { get; set; }
56-
57-
[JsonPropertyName("title")]
58-
public string? Title { get; set; }
59-
6058
[JsonPropertyName("description")]
6159
public string? Description { get; set; }
6260

@@ -72,7 +70,7 @@ public record DocumentationDocument
7270
[JsonPropertyName("body")]
7371
public string? Body { get; set; }
7472

75-
// Stripped body is the body with Markdown removed, suitable for search indexing
73+
/// <summary>Stripped body is the body with Markdown removed, suitable for search indexing</summary>
7674
[JsonPropertyName("stripped_body")]
7775
public string? StrippedBody { get; set; }
7876

0 commit comments

Comments
 (0)