| | | 1 | | using System.Net; |
| | | 2 | | using System.Text; |
| | | 3 | | using System.Text.Json; |
| | | 4 | | using GistBackend.Exceptions; |
| | | 5 | | using GistBackend.Handlers.AIHandler; |
| | | 6 | | using GistBackend.Types; |
| | | 7 | | using GistBackend.Utils; |
| | | 8 | | using Microsoft.Extensions.Logging; |
| | | 9 | | using Microsoft.Extensions.Options; |
| | | 10 | | using static GistBackend.Utils.LogEvents; |
| | | 11 | | |
| | | 12 | | namespace GistBackend.Handlers.ChromaDbHandler; |
| | | 13 | | |
/// <summary>
/// Operations on the ChromaDB collection that holds gist embeddings and their metadata.
/// </summary>
public interface IChromaDbHandler
{
    /// <summary>
    /// Stores the embedding of <paramref name="summary"/> under the reference of
    /// <paramref name="entry"/>, adding the document or updating an existing one.
    /// </summary>
    Task UpsertEntryAsync(
        RssEntry entry,
        string summary,
        CancellationToken ct);

    /// <summary>
    /// Makes sure the stored metadata of <paramref name="gist"/> matches its feed id and the
    /// given <paramref name="disabled"/> flag.
    /// </summary>
    /// <returns>
    /// <c>true</c> when the stored metadata was already correct; <c>false</c> when it had to be updated.
    /// </returns>
    Task<bool> EnsureGistHasCorrectMetadataAsync(
        Gist gist,
        bool disabled,
        CancellationToken ct);

    /// <summary>
    /// Finds entries similar to the stored entry identified by <paramref name="reference"/>,
    /// skipping disabled entries and entries of the given disabled feeds.
    /// </summary>
    Task<List<SimilarDocument>> GetReferenceAndScoreOfSimilarEntriesAsync(
        string reference,
        int nResults,
        IEnumerable<int> disabledFeedIds,
        CancellationToken ct);

    /// <summary>
    /// Finds entries similar to the free-text <paramref name="query"/>,
    /// skipping disabled entries and entries of the given disabled feeds.
    /// </summary>
    Task<List<SimilarDocument>> SearchSimilarEntriesByQueryAsync(
        string query,
        int nResults,
        IEnumerable<int> disabledFeedIds,
        CancellationToken ct);
}
| | | 23 | | |
| | | 24 | | public class ChromaDbHandler : IChromaDbHandler |
| | | 25 | | { |
// Base address of the ChromaDB server, built from the configured server name and port.
private readonly Uri _chromaDbUri;
// Tenant, database and collection names used in every request path.
private readonly string _tenantName;
private readonly string _databaseName;
private readonly string _collectionName;
// Produces the embeddings that are stored and queried.
private readonly IAIHandler _aiHandler;
private readonly HttpClient _httpClient;
// Header name/value pair attached to every request for authentication.
private readonly string _credentialsHeaderName;
private readonly string _serverAuthnCredentials;
private readonly ILogger<ChromaDbHandler>? _logger;

/// <summary>
/// Creates a handler that talks to the ChromaDB server described by <paramref name="options"/>.
/// </summary>
/// <param name="aiHandler">Embedding generator.</param>
/// <param name="httpClient">HTTP client used for all ChromaDB requests.</param>
/// <param name="options">Server address, credentials and tenant/database/collection names.</param>
/// <param name="logger">Optional logger; may be <c>null</c>.</param>
/// <exception cref="ArgumentNullException"><paramref name="options"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException">The server or the credentials are missing from the options.</exception>
public ChromaDbHandler(IAIHandler aiHandler,
    HttpClient httpClient,
    IOptions<ChromaDbHandlerOptions> options,
    ILogger<ChromaDbHandler>? logger)
{
    // Fail with a descriptive exception instead of a NullReferenceException on options.Value.
    ArgumentNullException.ThrowIfNull(options);

    // Read the options snapshot once so every field is derived from the same instance.
    var settings = options.Value;
    if (string.IsNullOrWhiteSpace(settings.Server))
    {
        throw new ArgumentException("Server is not set in the options.");
    }

    if (string.IsNullOrWhiteSpace(settings.ServerAuthnCredentials))
    {
        throw new ArgumentException("Server authentication credentials are not set in the options.");
    }

    _aiHandler = aiHandler;
    _httpClient = httpClient;
    _logger = logger;
    _chromaDbUri = new Uri($"http://{settings.Server}:{settings.Port}/");
    _credentialsHeaderName = settings.CredentialsHeaderName;
    _serverAuthnCredentials = settings.ServerAuthnCredentials;
    _tenantName = settings.GistsTenantName;
    _databaseName = settings.GistsDatabaseName;
    _collectionName = settings.GistsCollectionName;
}
| | | 55 | | |
// Response sections requested from the collection query endpoint.
private static readonly string[] IncludeOnGet = new[] { "metadatas", "distances" };
| | | 57 | | |
/// <summary>
/// Returns the entries most similar to the stored entry identified by <paramref name="reference"/>.
/// </summary>
/// <param name="reference">Reference of the entry whose stored embedding is used as the query.</param>
/// <param name="nResults">Maximum number of similar entries to return.</param>
/// <param name="disabledFeedIds">Feeds whose entries are excluded from the results.</param>
/// <param name="ct">Cancellation token.</param>
/// <exception cref="ArgumentException">The reference is empty or too long.</exception>
/// <exception cref="DatabaseOperationException">The entry does not exist or a request failed.</exception>
public async Task<List<SimilarDocument>> GetReferenceAndScoreOfSimilarEntriesAsync(string reference,
    int nResults, IEnumerable<int> disabledFeedIds, CancellationToken ct)
{
    ValidateReference(reference);
    var collectionId = await GetOrCreateCollectionAsync(ct);
    if (!await EntryExistsByReferenceAsync(reference, ct, collectionId))
    {
        throw new DatabaseOperationException("Entry does not exist in database");
    }

    var document = await GetDocumentByReferenceAsync(reference, collectionId, true, false, ct);
    var similarDocuments = await GetSimilarDocumentsByEmbeddingsAsync(
        document.Embeddings!.Single(),
        nResults + 1, // +1 because the original entry normally matches itself
        disabledFeedIds,
        ct
    );

    // Drop the original entry. When the filter already excluded it (it is disabled or belongs
    // to a disabled feed), all nResults + 1 rows are other entries, so cap the list as well.
    return similarDocuments
        .Where(similar => similar.Reference != reference)
        .Take(nResults)
        .ToList();
}
| | | 79 | | |
/// <summary>
/// Embeds the free-text <paramref name="query"/> and returns the stored entries closest to it.
/// </summary>
/// <exception cref="ArgumentException">The query is empty or too long.</exception>
public async Task<List<SimilarDocument>> SearchSimilarEntriesByQueryAsync(string query, int nResults,
    IEnumerable<int> disabledFeedIds, CancellationToken ct)
{
    ValidateSearchQuery(query);

    var queryEmbedding = await _aiHandler.GenerateEmbeddingAsync(query, ct);
    var matches = await GetSimilarDocumentsByEmbeddingsAsync(queryEmbedding, nResults, disabledFeedIds, ct);
    return matches;
}
| | | 87 | | |
/// <summary>
/// Queries the collection for the <paramref name="nResults"/> documents nearest to
/// <paramref name="embeddings"/>, filtered by <see cref="GenerateWhere"/>.
/// </summary>
/// <exception cref="DatabaseOperationException">
/// The query did not return <see cref="HttpStatusCode.OK"/> or the response body was empty.
/// </exception>
private async Task<List<SimilarDocument>> GetSimilarDocumentsByEmbeddingsAsync(float[] embeddings, int nResults,
    IEnumerable<int> disabledFeedIds, CancellationToken ct)
{
    var collectionId = await GetOrCreateCollectionAsync(ct);

    // Property names are part of the wire format; they are mapped by the shared serializer options.
    using var content = CreateStringContent(new {
        QueryEmbeddings = new[] {embeddings},
        NResults = nResults,
        Where = GenerateWhere(disabledFeedIds),
        Include = IncludeOnGet
    });

    // Dispose the response (and its buffered content) once the result has been extracted.
    using var response = await SendPostRequestAsync(
        $"/api/v2/tenants/{_tenantName}/databases/{_databaseName}/collections/{collectionId}/query", content, ct);
    if (response.StatusCode != HttpStatusCode.OK)
    {
        throw await CreateDatabaseOperationExceptionAsync("Could not query similar entries", response, ct);
    }

    await using var responseContent = await response.Content.ReadAsStreamAsync(ct);
    var queryResponse =
        await JsonSerializer.DeserializeAsync<QueryResponse>(responseContent, SerializerDefaults.JsonOptions, ct);
    return queryResponse is null
        ? throw new DatabaseOperationException("Could not get similar entries")
        : ExtractReferencesAndScores(queryResponse);
}
| | | 112 | | |
/// <summary>
/// Builds the metadata filter for similarity queries: entries must not be disabled and, when
/// <paramref name="disabledFeedIds"/> is non-empty, must not belong to any of those feeds.
/// </summary>
private static Dictionary<string, object> GenerateWhere(IEnumerable<int> disabledFeedIds)
{
    var notDisabledClause = new Dictionary<string, object>
    {
        ["disabled"] = new Dictionary<string, object> { ["$ne"] = true }
    };

    var excludedFeedIds = disabledFeedIds.ToArray();
    if (excludedFeedIds.Length > 0)
    {
        var notInDisabledFeedsClause = new Dictionary<string, object>
        {
            ["feed_id"] = new Dictionary<string, object> { ["$nin"] = excludedFeedIds }
        };

        // Both conditions have to hold, so combine them under a single "$and".
        return new Dictionary<string, object>
        {
            ["$and"] = new[] { notDisabledClause, notInDisabledFeedsClause }
        };
    }

    return notDisabledClause;
}
| | | 134 | | |
/// <summary>
/// Turns the first result row of <paramref name="queryResponse"/> into a list pairing each
/// document reference with its similarity score.
/// </summary>
private static List<SimilarDocument> ExtractReferencesAndScores(QueryResponse queryResponse)
{
    // Only one embedding is ever sent per query, so only the first row is read.
    var resultCount = queryResponse.Ids.First().Length;
    var similarDocuments = new List<SimilarDocument>(resultCount);
    for (var index = 0; index < resultCount; index++)
    {
        similarDocuments.Add(new SimilarDocument(
            queryResponse.Metadatas.First()[index].Reference,
            ConvertCosineDistanceToSimilarity(queryResponse.Distances.First()[index])));
    }

    return similarDocuments;
}
| | | 142 | | |
/// <summary>
/// Maps a distance to a similarity via <c>1 - distance / 2</c>, clamped to the range [0, 1].
/// </summary>
private static float ConvertCosineDistanceToSimilarity(float distance)
{
    var similarity = 1 - distance / 2;
    return Math.Clamp(similarity, 0f, 1f);
}
| | | 144 | | |
/// <summary>
/// Stores the embedding of <paramref name="summary"/> for <paramref name="entry"/>: the document is
/// added when no document with the entry's reference exists, otherwise it is updated.
/// </summary>
/// <param name="entry">Entry whose reference becomes the document id and whose feed id goes into the metadata.</param>
/// <param name="summary">Text to embed.</param>
/// <param name="ct">Cancellation token.</param>
/// <exception cref="ArgumentException">The entry reference is empty or too long.</exception>
/// <exception cref="DatabaseOperationException">The server did not answer with the expected status code.</exception>
public async Task UpsertEntryAsync(RssEntry entry, string summary, CancellationToken ct)
{
    ValidateReference(entry.Reference);
    var collectionId = await GetOrCreateCollectionAsync(ct);
    var mode = "add";
    if (await EntryExistsByReferenceAsync(entry.Reference, ct, collectionId))
    {
        _logger?.LogInformation(EntryAlreadyExistsInChromaDb,
            "Entry with reference {Reference} already exists in database", entry.Reference);
        mode = "update";
    }

    var metadata = new Metadata(entry.Reference, entry.FeedId);
    var embedding = await _aiHandler.GenerateEmbeddingAsync(summary, ct);

    // Content and response are disposed when the method exits.
    using var content = CreateStringContent(new Document([entry.Reference], [metadata], [embedding]));
    using var response = await SendPostRequestAsync(
        $"/api/v2/tenants/{_tenantName}/databases/{_databaseName}/collections/{collectionId}/{mode}", content, ct);

    // "add" is acknowledged with 201 Created, "update" with 200 OK.
    var expectedStatus = mode == "add" ? HttpStatusCode.Created : HttpStatusCode.OK;
    if (response.StatusCode != expectedStatus)
    {
        throw await CreateDatabaseOperationExceptionAsync($"Could not {mode} entry", response, ct);
    }

    _logger?.LogInformation(DocumentInserted,
        "Upserted ({Mode}) document with metadata {Metadata} for entry with reference {Reference}",
        mode, metadata, entry.Reference);
}
| | | 172 | | |
/// <summary>
/// Compares the stored metadata of <paramref name="gist"/> with its feed id and the requested
/// <paramref name="disabled"/> flag and rewrites the metadata when they differ.
/// </summary>
/// <returns><c>true</c> if nothing had to change; <c>false</c> if the metadata was updated.</returns>
/// <exception cref="ArgumentException">The gist reference is empty or too long.</exception>
/// <exception cref="DatabaseOperationException">No metadata is stored for the gist's reference.</exception>
public async Task<bool> EnsureGistHasCorrectMetadataAsync(Gist gist, bool disabled, CancellationToken ct)
{
    ValidateReference(gist.Reference);

    var collectionId = await GetOrCreateCollectionAsync(ct);
    var stored = await GetDocumentByReferenceAsync(gist.Reference, collectionId, false, true, ct);
    var currentMetadata = stored.Metadatas.FirstOrDefault()
        ?? throw new DatabaseOperationException($"Entry with reference {gist.Reference} does not exist in ChromaDb");

    var alreadyCorrect = currentMetadata.Disabled == disabled && currentMetadata.FeedId == gist.FeedId;
    if (alreadyCorrect)
    {
        return true;
    }

    var replacement = new Metadata(gist.Reference, gist.FeedId, disabled);
    await UpdateMetadataAsync(gist.Reference, replacement, ct);
    _logger?.LogInformation(ChangedMetadataOfGistInChromaDb,
        "Changed metadata from {OldMetadata} to {NewMetadata} for gist with reference {GistReference}",
        currentMetadata, replacement, gist.Reference);
    return false;
}
| | | 191 | | |
/// <summary>
/// Replaces the metadata of the document identified by <paramref name="reference"/>.
/// </summary>
/// <exception cref="ArgumentException">The reference is empty or too long.</exception>
/// <exception cref="DatabaseOperationException">
/// The document does not exist or the update did not return <see cref="HttpStatusCode.OK"/>.
/// </exception>
private async Task UpdateMetadataAsync(string reference, Metadata metadata, CancellationToken ct)
{
    ValidateReference(reference);
    var collectionId = await GetOrCreateCollectionAsync(ct);
    if (!await EntryExistsByReferenceAsync(reference, ct, collectionId))
    {
        throw new DatabaseOperationException("Entry to update does not exist");
    }

    // Content and response are disposed when the method exits.
    using var content = CreateStringContent(new Document([reference], [metadata]));
    using var response = await SendPostRequestAsync(
        $"/api/v2/tenants/{_tenantName}/databases/{_databaseName}/collections/{collectionId}/update", content, ct);

    if (response.StatusCode != HttpStatusCode.OK)
    {
        throw await CreateDatabaseOperationExceptionAsync("Could not update entry", response, ct);
    }
}
| | | 209 | | |
/// <summary>
/// Checks whether a document with the given <paramref name="reference"/> is stored in the collection.
/// </summary>
/// <param name="reference">Document id to look up.</param>
/// <param name="ct">Cancellation token.</param>
/// <param name="collectionId">
/// Id of the collection to search; when <c>null</c> it is resolved via
/// <see cref="GetOrCreateCollectionAsync"/>.
/// </param>
/// <returns><c>true</c> when the lookup returned at least one id.</returns>
public async Task<bool> EntryExistsByReferenceAsync(string reference, CancellationToken ct,
    string? collectionId = null)
{
    collectionId ??= await GetOrCreateCollectionAsync(ct);
    var document = await GetDocumentByReferenceAsync(reference, collectionId, false, false, ct);
    return document.Ids.Length != 0;
}
| | | 216 | | |
/// <summary>
/// Fetches the document with id <paramref name="reference"/> from the collection.
/// </summary>
/// <param name="reference">Document id to fetch.</param>
/// <param name="collectionId">Id of the collection to read from.</param>
/// <param name="includeEmbeddings">Request the stored embeddings.</param>
/// <param name="includeMetadata">Request the stored metadata.</param>
/// <param name="ct">Cancellation token.</param>
/// <exception cref="DatabaseOperationException">
/// The request failed, the body was empty, or embeddings were requested but are missing.
/// </exception>
private async Task<Document> GetDocumentByReferenceAsync(string reference, string collectionId,
    bool includeEmbeddings, bool includeMetadata, CancellationToken ct)
{
    var include = new List<string>();
    if (includeEmbeddings) include.Add("embeddings");
    if (includeMetadata) include.Add("metadatas");

    using var content = CreateStringContent(new { Ids = new[] { reference }, Include = include });
    using var response = await SendPostRequestAsync(
        $"/api/v2/tenants/{_tenantName}/databases/{_databaseName}/collections/{collectionId}/get", content, ct);

    // Never try to read an error payload as a Document: report the failure with status and body.
    if (!response.IsSuccessStatusCode)
    {
        throw await CreateDatabaseOperationExceptionAsync("Could not get entry", response, ct);
    }

    await using var responseContent = await response.Content.ReadAsStreamAsync(ct);
    var document =
        await JsonSerializer.DeserializeAsync<Document>(responseContent, SerializerDefaults.JsonOptions, ct);
    if (document is null || (includeEmbeddings && document.Embeddings is null))
    {
        throw await CreateDatabaseOperationExceptionAsync("Could not get entry", response, ct);
    }

    return document;
}
| | | 235 | | |
/// <summary>
/// Returns the id of the configured collection, creating the tenant, database and collection
/// first when they do not exist yet.
/// </summary>
/// <exception cref="DatabaseOperationException">Creating the collection did not return <see cref="HttpStatusCode.OK"/>.</exception>
private async Task<string> GetOrCreateCollectionAsync(CancellationToken ct)
{
    await CreateDatabaseIfNotExistsAsync(ct);
    var existingCollectionId = await GetCollectionIdAsync(_collectionName, ct);
    if (existingCollectionId is not null) return existingCollectionId;

    // Content and response are disposed when the method exits.
    using var requestContent = CreateStringContent(new CollectionDefinition(_collectionName));
    using var response =
        await SendPostRequestAsync($"/api/v2/tenants/{_tenantName}/databases/{_databaseName}/collections",
            requestContent, ct);

    if (response.StatusCode != HttpStatusCode.OK)
    {
        throw await CreateDatabaseOperationExceptionAsync("Could not create collection", response, ct);
    }

    return await ExtractCollectionIdAsync(response, ct);
}
| | | 253 | | |
/// <summary>
/// Looks up the id of the collection named <paramref name="collectionName"/>.
/// </summary>
/// <returns>The collection id, or <c>null</c> when the server answers 404 Not Found.</returns>
private async Task<string?> GetCollectionIdAsync(string collectionName, CancellationToken ct)
{
    // The response is disposed after the id has been read from its body.
    using var response = await SendGetRequestAsync(
        $"api/v2/tenants/{_tenantName}/databases/{_databaseName}/collections/{collectionName}", ct);
    return response.StatusCode == HttpStatusCode.NotFound ? null : await ExtractCollectionIdAsync(response, ct);
}
| | | 260 | | |
/// <summary>
/// Reads a collection description from the body of <paramref name="response"/> and returns its id.
/// </summary>
/// <exception cref="DatabaseOperationException">The body deserialized to <c>null</c>.</exception>
private static async Task<string> ExtractCollectionIdAsync(HttpResponseMessage response, CancellationToken ct)
{
    var body = await response.Content.ReadAsStreamAsync(ct);
    var parsed = await JsonSerializer.DeserializeAsync<Collection>(body, SerializerDefaults.JsonOptions, ct);
    if (parsed is { } collection)
    {
        return collection.Id;
    }

    throw await CreateDatabaseOperationExceptionAsync("Could not extract collection ID", response, ct);
}
| | | 272 | | |
/// <summary>
/// Ensures the tenant exists, then creates the configured database unless it is already present.
/// </summary>
/// <exception cref="DatabaseOperationException">Creating the database did not return <see cref="HttpStatusCode.OK"/>.</exception>
private async Task CreateDatabaseIfNotExistsAsync(CancellationToken ct)
{
    await CreateTenantIfNotExistsAsync(ct);
    if (await DatabaseExistsAsync(ct)) return;

    // Content and response are disposed when the method exits.
    using var content = CreateStringContent(new { Name = _databaseName });
    using var response = await SendPostRequestAsync($"/api/v2/tenants/{_tenantName}/databases", content, ct);
    if (response.StatusCode != HttpStatusCode.OK)
    {
        throw await CreateDatabaseOperationExceptionAsync("Could not create database", response, ct);
    }
}
| | | 284 | | |
/// <summary>
/// Reports whether the configured database exists, i.e. whether its lookup returns 200 OK.
/// </summary>
private async Task<bool> DatabaseExistsAsync(CancellationToken ct)
{
    // Only the status code is needed; release the response right away.
    using var response = await SendGetRequestAsync($"api/v2/tenants/{_tenantName}/databases/{_databaseName}", ct);
    return response.StatusCode == HttpStatusCode.OK;
}
| | | 290 | | |
/// <summary>
/// Creates the configured tenant unless it is already present.
/// </summary>
/// <exception cref="DatabaseOperationException">Creating the tenant did not return <see cref="HttpStatusCode.OK"/>.</exception>
private async Task CreateTenantIfNotExistsAsync(CancellationToken ct)
{
    if (await TenantExistsAsync(ct)) return;

    // Content and response are disposed when the method exits.
    using var content = CreateStringContent(new { Name = _tenantName });
    using var response = await SendPostRequestAsync("/api/v2/tenants", content, ct);
    if (response.StatusCode != HttpStatusCode.OK)
    {
        throw await CreateDatabaseOperationExceptionAsync("Could not create tenant", response, ct);
    }
}
| | | 301 | | |
/// <summary>
/// Reports whether the configured tenant exists, i.e. whether its lookup returns 200 OK.
/// </summary>
private async Task<bool> TenantExistsAsync(CancellationToken ct)
{
    // Only the status code is needed; release the response right away.
    using var response = await SendGetRequestAsync($"api/v2/tenants/{_tenantName}", ct);
    return response.StatusCode == HttpStatusCode.OK;
}
| | | 307 | | |
/// <summary>Sends a body-less GET request to <paramref name="relativeUri"/> on the ChromaDB server.</summary>
private Task<HttpResponseMessage> SendGetRequestAsync(string relativeUri, CancellationToken ct)
{
    return SendRequestAsync(HttpMethod.Get, relativeUri, ct);
}
| | | 310 | | |
/// <summary>Sends a POST request carrying <paramref name="content"/> to <paramref name="relativeUri"/>.</summary>
private Task<HttpResponseMessage> SendPostRequestAsync(string relativeUri, HttpContent content,
    CancellationToken ct)
{
    return SendRequestAsync(HttpMethod.Post, relativeUri, ct, content);
}
| | | 313 | | |
/// <summary>
/// Builds an authenticated request for <paramref name="relativeUri"/> (resolved against the
/// server base address) and sends it.
/// </summary>
/// <param name="method">HTTP method to use.</param>
/// <param name="relativeUri">Path relative to the server base address.</param>
/// <param name="ct">Cancellation token.</param>
/// <param name="content">Optional request body.</param>
/// <returns>The server's response; the caller owns it.</returns>
private async Task<HttpResponseMessage> SendRequestAsync(HttpMethod method, string relativeUri,
    CancellationToken ct, HttpContent? content = null)
{
    var uri = new Uri(_chromaDbUri, relativeUri);

    // The request is no longer needed once the response has arrived, so release it here.
    using var request = CreateHttpRequestMessage(method, uri, content);
    return await _httpClient.SendAsync(request, ct);
}
| | | 321 | | |
/// <summary>
/// Serializes <paramref name="objectToSerialize"/> with the shared JSON options and wraps the
/// result as UTF-8 <c>application/json</c> content.
/// </summary>
private static StringContent CreateStringContent(object objectToSerialize)
{
    var json = JsonSerializer.Serialize(objectToSerialize, SerializerDefaults.JsonOptions);
    return new StringContent(json, Encoding.UTF8, "application/json");
}
| | | 325 | | |
/// <summary>
/// Creates a request for <paramref name="uri"/> that carries the configured credentials header
/// and the optional <paramref name="content"/>.
/// </summary>
private HttpRequestMessage CreateHttpRequestMessage(HttpMethod method, Uri uri, HttpContent? content = null)
{
    var message = new HttpRequestMessage(method, uri) { Content = content };
    message.Headers.Add(_credentialsHeaderName, _serverAuthnCredentials);
    return message;
}
| | | 333 | | |
/// <summary>
/// Builds a <see cref="DatabaseOperationException"/> whose message combines
/// <paramref name="message"/> with the status code and body of <paramref name="response"/>.
/// </summary>
private static async Task<DatabaseOperationException> CreateDatabaseOperationExceptionAsync(string message,
    HttpResponseMessage response, CancellationToken ct)
{
    var body = await response.Content.ReadAsStringAsync(ct);
    var details = $"{message}. Code: {response.StatusCode}. Response: {body}";
    return new DatabaseOperationException(details);
}
| | | 340 | | |
/// <summary>
/// Rejects references that are empty or 1,000,000 or more characters long.
/// </summary>
/// <exception cref="ArgumentException">The reference has an invalid length.</exception>
private static void ValidateReference(string reference)
{
    var length = reference.Length;
    if (length == 0 || length >= 1000000)
    {
        throw new ArgumentException("Reference is invalid.");
    }
}
| | | 345 | | |
/// <summary>
/// Rejects search queries that are empty or 1,000,000 or more characters long.
/// </summary>
/// <exception cref="ArgumentException">The query has an invalid length.</exception>
private static void ValidateSearchQuery(string query)
{
    var length = query.Length;
    if (length == 0 || length >= 1000000)
    {
        throw new ArgumentException("Query is invalid.");
    }
}
| | | 350 | | } |