Przeglądaj źródła

[fix] On similarity search, check that all embeddings have been found

jherve 1 rok temu
rodzic
commit
8a996adeb7

+ 12 - 0
src/de_quoi_parle_le_monde/similarity_search.py

@@ -1,4 +1,5 @@
 from typing import Callable
+from loguru import logger
 import faiss
 import numpy as np
 
@@ -26,6 +27,17 @@ class SimilaritySearch:
         score_func: Callable[[float], bool],
     ):
         embeds = await self.storage.get_article_embedding(featured_article_snapshot_ids)
+
+        if (nb_embeds := len(embeds)) != (
+            nb_articles := len(featured_article_snapshot_ids)
+        ):
+            msg = (
+                f"Expected {nb_articles} embedding(s) in storage but found only {nb_embeds}. "
+                "A plausible cause is that they have not been computed yet"
+            )
+            logger.error(msg)
+            raise ValueError(msg)
+
         all_titles = np.array([e["title_embedding"] for e in embeds])
         faiss.normalize_L2(all_titles)
         D, I = self.index.search(np.array(all_titles), nb_results)

+ 10 - 5
src/de_quoi_parle_le_monde/web.py

@@ -55,11 +55,16 @@ async def site_main_article_snapshot(
     same_site_articles = [
         a for a in main_articles if a["site_id"] == id and a["time_diff"] != 0
     ]
-    [(_, similar)] = await sim_index.search(
-        [focused_article["featured_article_snapshot_id"]],
-        20,
-        lambda s: s < 1.0 and s >= 0.5,
-    )
+
+    try:
+        [(_, similar)] = await sim_index.search(
+            [focused_article["featured_article_snapshot_id"]],
+            20,
+            lambda s: s < 1.0 and s >= 0.5,
+        )
+    except ValueError:
+        similar = []
+
     similar_by_id = {s[0]: s[1] for s in similar}
     similar_articles = await storage.list_featured_article_snapshots(
         list(similar_by_id.keys())