| | | 1 | | using System.Net; |
| | | 2 | | using System.Text; |
| | | 3 | | using System.Text.Json; |
| | | 4 | | using GistBackend.Exceptions; |
| | | 5 | | using GistBackend.Handlers.AIHandler; |
| | | 6 | | using GistBackend.Types; |
| | | 7 | | using GistBackend.Utils; |
| | | 8 | | using Microsoft.Extensions.Logging; |
| | | 9 | | using Microsoft.Extensions.Options; |
| | | 10 | | using static GistBackend.Utils.LogEvents; |
| | | 11 | | |
| | | 12 | | namespace GistBackend.Handlers.ChromaDbHandler; |
| | | 13 | | |
/// <summary>
/// Operations on the gist documents stored in a ChromaDb collection.
/// </summary>
public interface IChromaDbHandler
{
    /// <summary>
    /// Stores an embedding of <paramref name="summary"/> for <paramref name="entry"/>, adding a new
    /// document or updating the existing one that has the same reference.
    /// </summary>
    /// <param name="entry">RSS entry whose reference identifies the document.</param>
    /// <param name="summary">Text the embedding is generated from.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    Task UpsertEntryAsync(RssEntry entry, string summary, CancellationToken ct);

    /// <summary>
    /// Makes sure the metadata stored for <paramref name="gist"/> carries the given disabled flag and
    /// the gist's feed id.
    /// </summary>
    /// <param name="gist">Gist whose stored metadata is checked.</param>
    /// <param name="disabled">Disabled flag the stored metadata should have.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns><c>true</c> when the stored metadata already matched, <c>false</c> when it had to be changed.</returns>
    Task<bool> EnsureGistHasCorrectMetadataAsync(Gist gist, bool disabled, CancellationToken ct);

    /// <summary>
    /// Looks up documents similar to the stored entry identified by <paramref name="reference"/>.
    /// </summary>
    /// <param name="reference">Reference of the entry to compare against.</param>
    /// <param name="nResults">Number of similar entries requested.</param>
    /// <param name="disabledFeedIds">Feed ids whose entries are left out of the results.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>The reference and similarity score of each similar entry.</returns>
    Task<List<SimilarDocument>> GetReferenceAndScoreOfSimilarEntriesAsync(
        string reference,
        int nResults,
        IEnumerable<int> disabledFeedIds,
        CancellationToken ct);
}
| | | 21 | | |
| | | 22 | | public class ChromaDbHandler : IChromaDbHandler |
| | | 23 | | { |
// Base address of the ChromaDb server; request paths are resolved against it.
private readonly Uri _chromaDbUri;
// Tenant, database and collection names used to build the request paths.
private readonly string _tenantName;
private readonly string _databaseName;
private readonly string _collectionName;
// Generates the embeddings that are stored alongside each entry.
private readonly IAIHandler _aiHandler;
private readonly HttpClient _httpClient;
// Name and value of the credentials header attached to every request.
private readonly string _credentialsHeaderName;
private readonly string _serverAuthnCredentials;
private readonly ILogger<ChromaDbHandler>? _logger;

/// <summary>
/// Creates a handler that talks to the ChromaDb server described by <paramref name="options"/>.
/// </summary>
/// <param name="aiHandler">Used to generate embeddings.</param>
/// <param name="httpClient">Client used for all requests to the server.</param>
/// <param name="options">Server address, credentials and tenant/database/collection names.</param>
/// <param name="logger">Optional logger; nothing is logged when it is <c>null</c>.</param>
/// <exception cref="ArgumentException">
/// The server or the authentication credentials are missing or whitespace.
/// </exception>
public ChromaDbHandler(IAIHandler aiHandler,
    HttpClient httpClient,
    IOptions<ChromaDbHandlerOptions> options,
    ILogger<ChromaDbHandler>? logger)
{
    var settings = options.Value;
    if (string.IsNullOrWhiteSpace(settings.Server))
    {
        throw new ArgumentException("Server is not set in the options.");
    }

    if (string.IsNullOrWhiteSpace(settings.ServerAuthnCredentials))
    {
        throw new ArgumentException("Server authentication credentials are not set in the options.");
    }

    _aiHandler = aiHandler;
    _httpClient = httpClient;
    _logger = logger;

    _chromaDbUri = new Uri($"http://{settings.Server}:{settings.Port}/");
    _credentialsHeaderName = settings.CredentialsHeaderName;
    _serverAuthnCredentials = settings.ServerAuthnCredentials;

    _tenantName = settings.GistsTenantName;
    _databaseName = settings.GistsDatabaseName;
    _collectionName = settings.GistsCollectionName;
}
| | | 53 | | |
// Fields requested for every hit of a similarity query.
private static readonly string[] IncludeOnGet = ["metadatas", "distances"];

/// <summary>
/// Finds the stored entries most similar to the entry identified by <paramref name="reference"/>.
/// </summary>
/// <param name="reference">Reference of the stored entry whose embedding is used as the query.</param>
/// <param name="nResults">Maximum number of similar entries to return.</param>
/// <param name="disabledFeedIds">Feed ids whose entries are excluded from the query.</param>
/// <param name="ct">Token used to cancel the operation.</param>
/// <returns>
/// At most <paramref name="nResults"/> entries with their similarity scores, never including the
/// entry identified by <paramref name="reference"/> itself.
/// </returns>
/// <exception cref="ArgumentException">The reference is rejected by reference validation.</exception>
/// <exception cref="DatabaseOperationException">
/// The entry is not stored, or the query failed or returned no parsable body.
/// </exception>
public async Task<List<SimilarDocument>> GetReferenceAndScoreOfSimilarEntriesAsync(string reference,
    int nResults, IEnumerable<int> disabledFeedIds, CancellationToken ct)
{
    ValidateReference(reference);
    var collectionId = await GetOrCreateCollectionAsync(ct);
    if (!await EntryExistsByReferenceAsync(reference, ct, collectionId))
    {
        throw new DatabaseOperationException("Entry does not exist in database");
    }

    var document = await GetDocumentByReferenceAsync(reference, collectionId, true, false, ct);
    var content = CreateStringContent(new {
        QueryEmbeddings = new[] {document.Embeddings!.Single()},
        NResults = nResults+1, // +1 because the original entry usually matches itself and is dropped below
        Where = GenerateWhere(disabledFeedIds),
        Include = IncludeOnGet
    });
    var response = await SendPostRequestAsync(
        $"/api/v2/tenants/{_tenantName}/databases/{_databaseName}/collections/{collectionId}/query", content, ct);
    if (response.StatusCode != HttpStatusCode.OK)
    {
        throw await CreateDatabaseOperationExceptionAsync("Could not query similar entries", response, ct);
    }

    var responseContent = await response.Content.ReadAsStreamAsync(ct);
    var queryResponse =
        await JsonSerializer.DeserializeAsync<QueryResponse>(responseContent, SerializerDefaults.JsonOptions, ct);
    if (queryResponse is null) throw new DatabaseOperationException("Could not get similar entries");
    var referencesAndScores = ExtractReferencesAndScores(queryResponse);

    // Exclude the original entry from the results. When the original entry was already removed by
    // the where filter (it is disabled or belongs to a disabled feed), all nResults+1 hits are other
    // entries, so the list is capped to the number the caller asked for.
    return referencesAndScores
        .Where(referenceAndScore => referenceAndScore.Reference != reference)
        .Take(nResults)
        .ToList();
}
| | | 89 | | |
/// <summary>
/// Builds the <c>where</c> filter for a similarity query.
/// </summary>
/// <param name="disabledFeedIds">Feed ids whose entries must not be returned.</param>
/// <returns>
/// A filter requiring <c>disabled != true</c>; when at least one feed id is given it is combined
/// through <c>$and</c> with a <c>feed_id $nin</c> clause.
/// </returns>
private static Dictionary<string, object> GenerateWhere(IEnumerable<int> disabledFeedIds)
{
    var notDisabledClause = new Dictionary<string, object> {
        ["disabled"] = new Dictionary<string, object> { ["$ne"] = true }
    };

    // Materialize once: the ids are both counted and embedded in the filter.
    var excludedFeedIds = disabledFeedIds.ToArray();
    if (excludedFeedIds.Length > 0)
    {
        var notInDisabledFeedsClause = new Dictionary<string, object> {
            ["feed_id"] = new Dictionary<string, object> { ["$nin"] = excludedFeedIds }
        };
        return new Dictionary<string, object> {
            ["$and"] = new[] { notDisabledClause, notInDisabledFeedsClause }
        };
    }

    return notDisabledClause;
}
| | | 111 | | |
/// <summary>
/// Pairs the reference of every hit in the first result set of <paramref name="queryResponse"/>
/// with its similarity score.
/// </summary>
/// <param name="queryResponse">Deserialized response of a similarity query.</param>
/// <returns>One <see cref="SimilarDocument"/> per id in the first result set, in response order.</returns>
private static List<SimilarDocument> ExtractReferencesAndScores(QueryResponse queryResponse)
{
    var hitCount = queryResponse.Ids.First().Length;
    var similarDocuments = new List<SimilarDocument>();
    for (var index = 0; index < hitCount; index++)
    {
        // Metadata and distance of one hit share the same position in their result sets.
        var hitReference = queryResponse.Metadatas.First()[index].Reference;
        var similarity = ConvertCosineDistanceToSimilarity(queryResponse.Distances.First()[index]);
        similarDocuments.Add(new SimilarDocument(hitReference, similarity));
    }

    return similarDocuments;
}
| | | 119 | | |
/// <summary>
/// Maps a distance to a similarity score via <c>1 - distance / 2</c>, limited to the range 0 to 1.
/// </summary>
/// <param name="distance">Distance reported by the query (presumably a cosine distance in [0, 2] — confirm against the collection configuration).</param>
/// <returns>A score between 0 and 1, where a distance of 0 gives 1.</returns>
private static float ConvertCosineDistanceToSimilarity(float distance)
{
    var similarity = 1 - distance / 2;
    return Math.Clamp(similarity, 0f, 1f);
}
| | | 121 | | |
/// <summary>
/// Stores an embedding of <paramref name="summary"/> for <paramref name="entry"/>: the document is
/// added when its reference is not stored yet and updated otherwise.
/// </summary>
/// <param name="entry">Entry whose reference is used as document id and whose feed id goes into the metadata.</param>
/// <param name="summary">Text the embedding is generated from.</param>
/// <param name="ct">Token used to cancel the operation.</param>
/// <exception cref="ArgumentException">The entry reference is rejected by reference validation.</exception>
/// <exception cref="DatabaseOperationException">The server did not answer with the expected status code.</exception>
public async Task UpsertEntryAsync(RssEntry entry, string summary, CancellationToken ct)
{
    ValidateReference(entry.Reference);
    var collectionId = await GetOrCreateCollectionAsync(ct);

    var alreadyStored = await EntryExistsByReferenceAsync(entry.Reference, ct, collectionId);
    if (alreadyStored)
    {
        _logger?.LogInformation(EntryAlreadyExistsInChromaDb,
            "Entry with reference {Reference} already exists in database", entry.Reference);
    }

    // The endpoint name doubles as the mode reported in errors and logs;
    // "add" is expected to answer 201 Created, "update" 200 OK.
    var mode = alreadyStored ? "update" : "add";
    var expectedStatus = alreadyStored ? HttpStatusCode.OK : HttpStatusCode.Created;

    var metadata = new Metadata(entry.Reference, entry.FeedId);
    var embedding = await _aiHandler.GenerateEmbeddingAsync(summary, ct);
    var content = CreateStringContent(new Document([entry.Reference], [metadata], [embedding]));
    var response = await SendPostRequestAsync(
        $"/api/v2/tenants/{_tenantName}/databases/{_databaseName}/collections/{collectionId}/{mode}", content, ct);

    if (response.StatusCode != expectedStatus)
    {
        throw await CreateDatabaseOperationExceptionAsync($"Could not {mode} entry", response, ct);
    }

    _logger?.LogInformation(DocumentInserted,
        "Upserted ({Mode}) document with metadata {Metadata} for entry with reference {Reference}",
        mode, metadata, entry.Reference);
}
| | | 149 | | |
/// <summary>
/// Compares the metadata stored for <paramref name="gist"/> with the expected disabled flag and feed
/// id and rewrites it when they differ.
/// </summary>
/// <param name="gist">Gist whose stored metadata is checked.</param>
/// <param name="disabled">Disabled flag the stored metadata should have.</param>
/// <param name="ct">Token used to cancel the operation.</param>
/// <returns><c>true</c> when nothing had to be changed, <c>false</c> when the metadata was updated.</returns>
/// <exception cref="ArgumentException">The gist reference is rejected by reference validation.</exception>
/// <exception cref="DatabaseOperationException">No metadata is stored for the gist, or the update failed.</exception>
public async Task<bool> EnsureGistHasCorrectMetadataAsync(Gist gist, bool disabled, CancellationToken ct)
{
    ValidateReference(gist.Reference);
    var collectionId = await GetOrCreateCollectionAsync(ct);
    var document = await GetDocumentByReferenceAsync(gist.Reference, collectionId, false, true, ct);

    var storedMetadata = document.Metadatas.FirstOrDefault()
        ?? throw new DatabaseOperationException($"Entry with reference {gist.Reference} does not exist in ChromaDb");

    var isUpToDate = storedMetadata.Disabled == disabled && storedMetadata.FeedId == gist.FeedId;
    if (isUpToDate)
    {
        return true;
    }

    var replacementMetadata = new Metadata(gist.Reference, gist.FeedId, disabled);
    await UpdateMetadataAsync(gist.Reference, replacementMetadata, ct);
    _logger?.LogInformation(ChangedMetadataOfGistInChromaDb,
        "Changed metadata from {OldMetadata} to {NewMetadata} for gist with reference {GistReference}",
        storedMetadata, replacementMetadata, gist.Reference);
    return false;
}
| | | 168 | | |
/// <summary>
/// Replaces the metadata of the stored document identified by <paramref name="reference"/>.
/// </summary>
/// <param name="reference">Reference (document id) of the entry to update.</param>
/// <param name="metadata">Metadata to store for the entry.</param>
/// <param name="ct">Token used to cancel the operation.</param>
/// <exception cref="ArgumentException">The reference is rejected by reference validation.</exception>
/// <exception cref="DatabaseOperationException">The entry is not stored, or the server did not answer 200 OK.</exception>
private async Task UpdateMetadataAsync(string reference, Metadata metadata, CancellationToken ct)
{
    ValidateReference(reference);
    var collectionId = await GetOrCreateCollectionAsync(ct);

    var entryExists = await EntryExistsByReferenceAsync(reference, ct, collectionId);
    if (!entryExists)
    {
        throw new DatabaseOperationException("Entry to update does not exist");
    }

    // Only ids and metadata are sent, so no embedding is part of the update request.
    var payload = CreateStringContent(new Document([reference], [metadata]));
    var response = await SendPostRequestAsync(
        $"/api/v2/tenants/{_tenantName}/databases/{_databaseName}/collections/{collectionId}/update", payload, ct);

    if (response.StatusCode == HttpStatusCode.OK)
    {
        return;
    }

    throw await CreateDatabaseOperationExceptionAsync("Could not update entry", response, ct);
}
| | | 186 | | |
/// <summary>
/// Checks whether a document with the given reference is stored in the collection.
/// </summary>
/// <param name="reference">Reference (document id) to look up.</param>
/// <param name="ct">Token used to cancel the operation.</param>
/// <param name="collectionId">
/// Id of the collection to search; when <c>null</c> the collection is looked up (and created if
/// missing) first.
/// </param>
/// <returns><c>true</c> when the lookup returned at least one id.</returns>
public async Task<bool> EntryExistsByReferenceAsync(string reference, CancellationToken ct, string? collectionId = null)
{
    collectionId ??= await GetOrCreateCollectionAsync(ct);
    // Neither embeddings nor metadata are needed; only the returned ids matter.
    var document = await GetDocumentByReferenceAsync(reference, collectionId, false, false, ct);
    return document.Ids.Length != 0;
}
| | | 193 | | |
/// <summary>
/// Fetches the document with the given reference from the collection.
/// </summary>
/// <param name="reference">Reference (document id) to fetch.</param>
/// <param name="collectionId">Id of the collection to read from.</param>
/// <param name="includeEmbeddings">Whether embeddings are requested; when set, a response without them is an error.</param>
/// <param name="includeMetadata">Whether metadata is requested.</param>
/// <param name="ct">Token used to cancel the operation.</param>
/// <returns>The deserialized response; its ids are empty when no such document is stored.</returns>
/// <exception cref="DatabaseOperationException">
/// The server did not answer 200 OK, the body was empty, or requested embeddings were missing.
/// </exception>
private async Task<Document> GetDocumentByReferenceAsync(string reference, string collectionId,
    bool includeEmbeddings, bool includeMetadata, CancellationToken ct)
{
    var include = new List<string>();
    if (includeEmbeddings) include.Add("embeddings");
    if (includeMetadata) include.Add("metadatas");
    var content = CreateStringContent(new { Ids = new[] { reference }, Include = include });
    var response = await SendPostRequestAsync(
        $"/api/v2/tenants/{_tenantName}/databases/{_databaseName}/collections/{collectionId}/get", content, ct);

    // Fail with the server's status and body instead of trying to parse an error payload as a
    // document, which would surface as a JsonException or a half-populated Document.
    if (response.StatusCode != HttpStatusCode.OK)
    {
        throw await CreateDatabaseOperationExceptionAsync("Could not get entry", response, ct);
    }

    var responseContent = await response.Content.ReadAsStreamAsync(ct);
    var document =
        await JsonSerializer.DeserializeAsync<Document>(responseContent, SerializerDefaults.JsonOptions, ct);
    if (document is null || (includeEmbeddings && document.Embeddings is null))
    {
        throw await CreateDatabaseOperationExceptionAsync("Could not get entry", response, ct);
    }
    return document;
}
| | | 212 | | |
/// <summary>
/// Returns the id of the configured collection, creating the tenant, database and collection
/// first when they do not exist.
/// </summary>
/// <param name="ct">Token used to cancel the operation.</param>
/// <returns>Id of the existing or newly created collection.</returns>
/// <exception cref="DatabaseOperationException">Creating the collection did not answer 200 OK.</exception>
private async Task<string> GetOrCreateCollectionAsync(CancellationToken ct)
{
    await CreateDatabaseIfNotExistsAsync(ct);

    var knownCollectionId = await GetCollectionIdAsync(_collectionName, ct);
    if (knownCollectionId is null)
    {
        var definition = CreateStringContent(new CollectionDefinition(_collectionName));
        var creationResponse = await SendPostRequestAsync(
            $"/api/v2/tenants/{_tenantName}/databases/{_databaseName}/collections", definition, ct);

        if (creationResponse.StatusCode != HttpStatusCode.OK)
        {
            throw await CreateDatabaseOperationExceptionAsync("Could not create collection", creationResponse, ct);
        }

        return await ExtractCollectionIdAsync(creationResponse, ct);
    }

    return knownCollectionId;
}
| | | 230 | | |
/// <summary>
/// Looks up the id of the collection named <paramref name="collectionName"/>.
/// </summary>
/// <param name="collectionName">Name of the collection to look up.</param>
/// <param name="ct">Token used to cancel the operation.</param>
/// <returns>The collection id, or <c>null</c> when the server answers 404 Not Found.</returns>
private async Task<string?> GetCollectionIdAsync(string collectionName, CancellationToken ct)
{
    var lookupResponse = await SendGetRequestAsync(
        $"api/v2/tenants/{_tenantName}/databases/{_databaseName}/collections/{collectionName}", ct);

    if (lookupResponse.StatusCode == HttpStatusCode.NotFound)
    {
        return null;
    }

    // Every other status is handed to the parser as-is.
    return await ExtractCollectionIdAsync(lookupResponse, ct);
}
| | | 237 | | |
/// <summary>
/// Reads the collection id from the JSON body of <paramref name="response"/>.
/// </summary>
/// <param name="response">Response whose body describes a collection.</param>
/// <param name="ct">Token used to cancel the operation.</param>
/// <returns>The id of the deserialized collection.</returns>
/// <exception cref="DatabaseOperationException">The body deserialized to <c>null</c>.</exception>
private static async Task<string> ExtractCollectionIdAsync(HttpResponseMessage response, CancellationToken ct)
{
    var body = await response.Content.ReadAsStreamAsync(ct);
    var parsedCollection =
        await JsonSerializer.DeserializeAsync<Collection>(body, SerializerDefaults.JsonOptions, ct);

    return parsedCollection is not null
        ? parsedCollection.Id
        : throw await CreateDatabaseOperationExceptionAsync("Could not extract collection ID", response, ct);
}
| | | 249 | | |
/// <summary>
/// Makes sure the configured tenant and database exist, creating whichever is missing.
/// </summary>
/// <param name="ct">Token used to cancel the operation.</param>
/// <exception cref="DatabaseOperationException">Creating the database did not answer 200 OK.</exception>
private async Task CreateDatabaseIfNotExistsAsync(CancellationToken ct)
{
    // The database lives inside the tenant, so the tenant is ensured first.
    await CreateTenantIfNotExistsAsync(ct);

    var databaseExists = await DatabaseExistsAsync(ct);
    if (databaseExists)
    {
        return;
    }

    var payload = CreateStringContent(new { Name = _databaseName });
    var creationResponse = await SendPostRequestAsync($"/api/v2/tenants/{_tenantName}/databases", payload, ct);
    if (creationResponse.StatusCode == HttpStatusCode.OK)
    {
        return;
    }

    throw await CreateDatabaseOperationExceptionAsync("Could not create database", creationResponse, ct);
}
| | | 261 | | |
/// <summary>
/// Asks the server for the configured database.
/// </summary>
/// <param name="ct">Token used to cancel the operation.</param>
/// <returns><c>true</c> only when the server answers 200 OK.</returns>
private async Task<bool> DatabaseExistsAsync(CancellationToken ct)
{
    var databaseUri = $"api/v2/tenants/{_tenantName}/databases/{_databaseName}";
    var lookupResponse = await SendGetRequestAsync(databaseUri, ct);
    return lookupResponse.StatusCode is HttpStatusCode.OK;
}
| | | 267 | | |
/// <summary>
/// Makes sure the configured tenant exists, creating it when the lookup does not find it.
/// </summary>
/// <param name="ct">Token used to cancel the operation.</param>
/// <exception cref="DatabaseOperationException">Creating the tenant did not answer 200 OK.</exception>
private async Task CreateTenantIfNotExistsAsync(CancellationToken ct)
{
    var tenantExists = await TenantExistsAsync(ct);
    if (tenantExists)
    {
        return;
    }

    var payload = CreateStringContent(new { Name = _tenantName });
    var creationResponse = await SendPostRequestAsync("/api/v2/tenants", payload, ct);
    if (creationResponse.StatusCode == HttpStatusCode.OK)
    {
        return;
    }

    throw await CreateDatabaseOperationExceptionAsync("Could not create tenant", creationResponse, ct);
}
| | | 278 | | |
/// <summary>
/// Asks the server for the configured tenant.
/// </summary>
/// <param name="ct">Token used to cancel the operation.</param>
/// <returns><c>true</c> only when the server answers 200 OK.</returns>
private async Task<bool> TenantExistsAsync(CancellationToken ct)
{
    var tenantUri = $"api/v2/tenants/{_tenantName}";
    var lookupResponse = await SendGetRequestAsync(tenantUri, ct);
    return lookupResponse.StatusCode is HttpStatusCode.OK;
}
| | | 284 | | |
/// <summary>
/// Sends a GET request without a body to <paramref name="relativeUri"/>.
/// </summary>
/// <param name="relativeUri">Path resolved against the server base address.</param>
/// <param name="ct">Token used to cancel the request.</param>
/// <returns>The server's response.</returns>
private Task<HttpResponseMessage> SendGetRequestAsync(string relativeUri, CancellationToken ct)
{
    return SendRequestAsync(HttpMethod.Get, relativeUri, ct);
}
| | | 287 | | |
/// <summary>
/// Sends a POST request carrying <paramref name="content"/> to <paramref name="relativeUri"/>.
/// </summary>
/// <param name="relativeUri">Path resolved against the server base address.</param>
/// <param name="content">Request body.</param>
/// <param name="ct">Token used to cancel the request.</param>
/// <returns>The server's response.</returns>
private Task<HttpResponseMessage> SendPostRequestAsync(string relativeUri, HttpContent content,
    CancellationToken ct)
{
    return SendRequestAsync(HttpMethod.Post, relativeUri, ct, content: content);
}
| | | 290 | | |
/// <summary>
/// Builds a request for <paramref name="relativeUri"/> and sends it through the HTTP client.
/// </summary>
/// <param name="method">HTTP method of the request.</param>
/// <param name="relativeUri">Path resolved against the server base address.</param>
/// <param name="ct">Token used to cancel the request.</param>
/// <param name="content">Optional request body.</param>
/// <returns>The server's response.</returns>
private async Task<HttpResponseMessage> SendRequestAsync(HttpMethod method, string relativeUri,
    CancellationToken ct, HttpContent? content = null)
{
    var target = new Uri(_chromaDbUri, relativeUri);
    return await _httpClient.SendAsync(CreateHttpRequestMessage(method, target, content), ct);
}
| | | 298 | | |
/// <summary>
/// Serializes <paramref name="objectToSerialize"/> with the shared serializer options into a UTF-8
/// <c>application/json</c> request body.
/// </summary>
/// <param name="objectToSerialize">Object to send as JSON.</param>
/// <returns>The JSON body.</returns>
private static StringContent CreateStringContent(object objectToSerialize)
{
    var json = JsonSerializer.Serialize(objectToSerialize, SerializerDefaults.JsonOptions);
    return new StringContent(json, Encoding.UTF8, "application/json");
}
| | | 302 | | |
/// <summary>
/// Creates a request for <paramref name="uri"/> that carries the configured credentials header.
/// </summary>
/// <param name="method">HTTP method of the request.</param>
/// <param name="uri">Absolute target of the request.</param>
/// <param name="content">Optional request body.</param>
/// <returns>The request, ready to be sent.</returns>
private HttpRequestMessage CreateHttpRequestMessage(HttpMethod method, Uri uri, HttpContent? content = null)
{
    var message = new HttpRequestMessage(method, uri) { Content = content };
    message.Headers.Add(_credentialsHeaderName, _serverAuthnCredentials);
    return message;
}
| | | 310 | | |
/// <summary>
/// Builds an exception whose message combines <paramref name="message"/> with the status code and
/// body of <paramref name="response"/>. The exception is returned, not thrown.
/// </summary>
/// <param name="message">Description of the operation that failed.</param>
/// <param name="response">Response that signalled the failure.</param>
/// <param name="ct">Token used to cancel reading the response body.</param>
/// <returns>The exception for the caller to throw.</returns>
private static async Task<DatabaseOperationException> CreateDatabaseOperationExceptionAsync(string message,
    HttpResponseMessage response, CancellationToken ct)
{
    var body = await response.Content.ReadAsStringAsync(ct);
    var details = $"{message}. Code: {response.StatusCode}. Response: {body}";
    return new DatabaseOperationException(details);
}
| | | 317 | | |
/// <summary>
/// Rejects references that cannot be used as a document id.
/// </summary>
/// <param name="reference">Reference to check.</param>
/// <exception cref="ArgumentException">
/// The reference is <c>null</c>, empty, or at least 1,000,000 characters long.
/// </exception>
private static void ValidateReference(string reference)
{
    // Exclusive upper bound for the reference length.
    const int maxReferenceLengthExclusive = 1_000_000;

    // A null reference is reported like any other invalid reference instead of escaping as a
    // NullReferenceException from the length check.
    if (reference is null || reference.Length is 0 or >= maxReferenceLengthExclusive)
    {
        throw new ArgumentException("Reference is invalid.");
    }
}
| | | 322 | | } |