Skip to content

Commit c1c32ba

Browse files
authored
Elasticsearch exporter: Tweak synonyms (#2229)
* Adjust analyzers and query retriever priorities to improve search results across synonyms
* Add document headings to the queryable content
* Add a custom tokenizer and simplify the analyzers
1 parent afd6741 commit c1c32ba

File tree

3 files changed

+31
-11
lines changed

3 files changed

+31
-11
lines changed

src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchExporter.cs

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,6 @@ DistributedTransport transport
6262
SearchInferenceId = endpoint.NoElasticInferenceService ? null : ".elser-2-elastic"
6363
});
6464

65-
6665
public abstract class ElasticsearchExporter<TChannelOptions, TChannel> : IDisposable
6766
where TChannelOptions : CatalogIndexChannelOptionsBase<DocumentationDocument>
6867
where TChannel : CatalogIndexChannel<DocumentationDocument, TChannelOptions>
@@ -149,14 +148,14 @@ protected static string CreateMappingSetting(string synonymSetName) =>
149148
"analysis": {
150149
"analyzer": {
151150
"synonyms_analyzer": {
152-
"tokenizer": "whitespace",
151+
"tokenizer": "group_tokenizer",
153152
"filter": [
154153
"lowercase",
155154
"synonyms_filter"
156155
]
157156
},
158157
"highlight_analyzer": {
159-
"tokenizer": "standard",
158+
"tokenizer": "group_tokenizer",
160159
"filter": [
161160
"lowercase",
162161
"english_stop"
@@ -176,7 +175,11 @@ protected static string CreateMappingSetting(string synonymSetName) =>
176175
}
177176
},
178177
"tokenizer": {
179-
"path_tokenizer": {
178+
"group_tokenizer": {
179+
"type": "char_group",
180+
"tokenize_on_chars": [ "whitespace", ",", ";", "?", "!", "(", ")", "&", "'", "\"", "/", "[", "]", "{", "}" ]
181+
},
182+
"path_tokenizer": {
180183
"type": "path_hierarchy",
181184
"delimiter": "/"
182185
}
@@ -243,17 +246,24 @@ protected static string CreateMapping(string? inferenceId) =>
243246
},
244247
"stripped_body": {
245248
"type": "text",
246-
"search_analyzer": "highlight_analyzer",
249+
"search_analyzer": "synonyms_analyzer",
247250
"term_vector": "with_positions_offsets"
248-
}
251+
},
252+
"headings": {
253+
"type": "text",
254+
"search_analyzer": "synonyms_analyzer"
255+
},
249256
{{(!string.IsNullOrWhiteSpace(inferenceId) ? AbstractInferenceMapping(inferenceId) : AbstractMapping())}}
250257
}
251258
}
252259
""";
253260

254261
private static string AbstractMapping() =>
255262
"""
256-
, "abstract": { "type": "text" }
263+
, "abstract": {
264+
"type": "text",
265+
"search_analyzer": "synonyms_analyzer"
266+
}
257267
""";
258268

259269
private static string InferenceMapping(string inferenceId) =>
@@ -278,5 +288,4 @@ public void Dispose()
278288

279289
GC.SuppressFinalize(this);
280290
}
281-
282291
}

src/api/Elastic.Documentation.Api.Core/Search/SearchUsecase.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ public record SearchResultItem
7272
public required string Title { get; init; }
7373
public required string Description { get; init; }
7474
public required SearchResultItemParent[] Parents { get; init; }
75+
public string[]? Headings { get; init; }
7576
public float Score { get; init; }
7677
public string? HighlightedBody { get; init; }
7778
}

src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ internal sealed record DocumentDto
3737
[JsonPropertyName("url_segment_count")]
3838
public int UrlSegmentCount { get; init; }
3939

40+
[JsonPropertyName("headings")]
41+
public string[] Headings { get; init; } = [];
42+
4043
[JsonPropertyName("parents")]
4144
public ParentDocumentDto[] Parents { get; init; } = [];
4245

@@ -88,10 +91,15 @@ public ElasticsearchGateway(ElasticsearchOptions elasticsearchOptions, ILogger<E
8891

8992
var lexicalSearchRetriever =
9093
((Query)new PrefixQuery(Infer.Field<DocumentDto>(f => f.Title.Suffix("keyword")), searchQuery) { Boost = 10.0f, CaseInsensitive = true }
94+
|| new MatchPhrasePrefixQuery(Infer.Field<DocumentDto>(f => f.Title), searchQuery) { Boost = 9.0f }
9195
|| new MatchQuery(Infer.Field<DocumentDto>(f => f.Title), searchQuery) { Operator = Operator.And, Boost = 8.0f }
9296
|| new MatchBoolPrefixQuery(Infer.Field<DocumentDto>(f => f.Title), searchQuery) { Boost = 6.0f }
93-
|| new MatchQuery(Infer.Field<DocumentDto>(f => f.Abstract), searchQuery) { Boost = 4.0f }
94-
|| new MatchQuery(Infer.Field<DocumentDto>(f => f.StrippedBody), searchQuery) { Boost = 3.0f }
97+
|| new MatchQuery(Infer.Field<DocumentDto>(f => f.Abstract), searchQuery) { Operator = Operator.And, Boost = 5.0f }
98+
|| new MatchQuery(Infer.Field<DocumentDto>(f => f.StrippedBody), searchQuery) { Operator = Operator.And, Boost = 4.5f }
99+
|| new MatchQuery(Infer.Field<DocumentDto>(f => f.Headings), searchQuery) { Operator = Operator.And, Boost = 4.5f }
100+
|| new MatchQuery(Infer.Field<DocumentDto>(f => f.Abstract), searchQuery) { Operator = Operator.Or, Boost = 4.0f }
101+
|| new MatchQuery(Infer.Field<DocumentDto>(f => f.StrippedBody), searchQuery) { Operator = Operator.Or, Boost = 3.0f }
102+
|| new MatchQuery(Infer.Field<DocumentDto>(f => f.Headings), searchQuery) { Operator = Operator.Or, Boost = 3.0f }
95103
|| new MatchQuery(Infer.Field<DocumentDto>(f => f.Parents.First().Title), searchQuery) { Boost = 2.0f }
96104
|| new MatchQuery(Infer.Field<DocumentDto>(f => f.Title), searchQuery) { Fuzziness = 1, Boost = 1.0f }
97105
)
@@ -129,7 +137,8 @@ public ElasticsearchGateway(ElasticsearchOptions elasticsearchOptions, ILogger<E
129137
e => e.Title,
130138
e => e.Url,
131139
e => e.Description,
132-
e => e.Parents
140+
e => e.Parents,
141+
e => e.Headings
133142
)
134143
)
135144
)
@@ -193,6 +202,7 @@ private static (int TotalHits, List<SearchResultItem> Results) ProcessSearchResp
193202
Url = doc.Url,
194203
Title = doc.Title,
195204
Description = doc.Description ?? string.Empty,
205+
Headings = doc.Headings,
196206
Parents = doc.Parents.Select(parent => new SearchResultItemParent
197207
{
198208
Title = parent.Title,

0 commit comments

Comments
 (0)