ソースを参照

Simplify database schema

jherve 1 年前
コミット
6d305a47bb

+ 3 - 1
pyproject.toml

@@ -45,7 +45,9 @@ build-backend = "hatchling.build"
 
 [tool.rye]
 managed = true
-dev-dependencies = []
+dev-dependencies = [
+    "ipython>=8.25.0",
+]
 
 [tool.hatch.metadata]
 allow-direct-references = true

+ 144 - 25
requirements-dev.lock

@@ -3,7 +3,7 @@
 #
 # last locked with the following flags:
 #   pre: false
-#   features: []
+#   features: ["embeddings"]
 #   all-features: false
 #   with-sources: false
 
@@ -18,13 +18,13 @@ aiofiles==23.2.1
 aiohttp==3.9.5
     # via aiobotocore
     # via aiohttp-client-cache
-    # via de-quoi-parle-le-monde
+    # via media-observer
 aiohttp-client-cache==0.11.0
-    # via de-quoi-parle-le-monde
+    # via media-observer
 aioitertools==0.11.0
     # via aiobotocore
 aiolimiter==1.1.0
-    # via de-quoi-parle-le-monde
+    # via media-observer
 aiosignal==1.3.1
     # via aiohttp
 aiosqlite==0.20.0
@@ -32,25 +32,27 @@ aiosqlite==0.20.0
 annotated-types==0.6.0
     # via pydantic
 annoy==1.17.3
-    # via de-quoi-parle-le-monde
+    # via media-observer
 anyio==4.3.0
     # via httpx
     # via starlette
     # via watchfiles
+asttokens==2.4.1
+    # via stack-data
 async-timeout==4.0.3
     # via asyncpg
 asyncpg==0.29.0
-    # via de-quoi-parle-le-monde
+    # via media-observer
 attrs==23.2.0
     # via aiohttp
     # via aiohttp-client-cache
     # via cattrs
-    # via de-quoi-parle-le-monde
+    # via media-observer
     # via requests-cache
 babel==2.15.0
-    # via de-quoi-parle-le-monde
+    # via media-observer
 beautifulsoup4==4.12.3
-    # via de-quoi-parle-le-monde
+    # via media-observer
 boto3==1.34.69
     # via aiobotocore
 botocore==1.34.69
@@ -58,7 +60,7 @@ botocore==1.34.69
     # via boto3
     # via s3transfer
 cattrs==23.2.3
-    # via de-quoi-parle-le-monde
+    # via media-observer
     # via requests-cache
 certifi==2024.2.2
     # via httpcore
@@ -69,21 +71,33 @@ charset-normalizer==3.3.2
 click==8.1.7
     # via typer
     # via uvicorn
+decorator==5.1.1
+    # via ipython
 dnspython==2.6.1
     # via email-validator
     # via pymongo
 dynaconf==3.2.5
-    # via de-quoi-parle-le-monde
+    # via media-observer
 email-validator==2.1.1
     # via fastapi
+executing==2.0.1
+    # via stack-data
 fastapi==0.111.0
-    # via de-quoi-parle-le-monde
     # via fastapi-cli
+    # via media-observer
 fastapi-cli==0.0.3
     # via fastapi
+filelock==3.14.0
+    # via huggingface-hub
+    # via torch
+    # via transformers
+    # via triton
 frozenlist==1.4.1
     # via aiohttp
     # via aiosignal
+fsspec==2024.6.0
+    # via huggingface-hub
+    # via torch
 h11==0.14.0
     # via httpcore
     # via hypercorn
@@ -99,10 +113,14 @@ httptools==0.6.1
     # via uvicorn
 httpx==0.27.0
     # via fastapi
+huggingface-hub==0.23.3
+    # via sentence-transformers
+    # via tokenizers
+    # via transformers
 humanize==4.9.0
-    # via de-quoi-parle-le-monde
+    # via media-observer
 hypercorn==0.16.0
-    # via de-quoi-parle-le-monde
+    # via media-observer
 hyperframe==6.0.1
     # via h2
 idna==3.7
@@ -111,46 +129,105 @@ idna==3.7
     # via httpx
     # via requests
     # via yarl
+ipython==8.25.0
 itsdangerous==2.2.0
     # via aiohttp-client-cache
+jedi==0.19.1
+    # via ipython
 jinja2==3.1.4
-    # via de-quoi-parle-le-monde
     # via fastapi
+    # via media-observer
+    # via torch
 jmespath==1.0.1
     # via boto3
     # via botocore
+joblib==1.4.2
+    # via scikit-learn
 loguru==0.7.2
-    # via de-quoi-parle-le-monde
+    # via media-observer
 lxml==5.2.2
-    # via de-quoi-parle-le-monde
+    # via media-observer
 markdown-it-py==3.0.0
     # via rich
 markupsafe==2.1.5
     # via jinja2
+matplotlib-inline==0.1.7
+    # via ipython
 mdurl==0.1.2
     # via markdown-it-py
 motor==3.4.0
     # via aiohttp-client-cache
+mpmath==1.3.0
+    # via sympy
 multidict==6.0.5
     # via aiohttp
     # via yarl
+networkx==3.3
+    # via torch
 numpy==1.26.4
-    # via de-quoi-parle-le-monde
+    # via media-observer
+    # via scikit-learn
+    # via scipy
+    # via sentence-transformers
+    # via transformers
+nvidia-cublas-cu12==12.1.3.1
+    # via nvidia-cudnn-cu12
+    # via nvidia-cusolver-cu12
+    # via torch
+nvidia-cuda-cupti-cu12==12.1.105
+    # via torch
+nvidia-cuda-nvrtc-cu12==12.1.105
+    # via torch
+nvidia-cuda-runtime-cu12==12.1.105
+    # via torch
+nvidia-cudnn-cu12==8.9.2.26
+    # via torch
+nvidia-cufft-cu12==11.0.2.54
+    # via torch
+nvidia-curand-cu12==10.3.2.106
+    # via torch
+nvidia-cusolver-cu12==11.4.5.107
+    # via torch
+nvidia-cusparse-cu12==12.1.0.106
+    # via nvidia-cusolver-cu12
+    # via torch
+nvidia-nccl-cu12==2.20.5
+    # via torch
+nvidia-nvjitlink-cu12==12.5.40
+    # via nvidia-cusolver-cu12
+    # via nvidia-cusparse-cu12
+nvidia-nvtx-cu12==12.1.105
+    # via torch
 orjson==3.10.3
     # via fastapi
 packaging==24.0
-    # via de-quoi-parle-le-monde
+    # via huggingface-hub
+    # via media-observer
+    # via transformers
+parso==0.8.4
+    # via jedi
+pexpect==4.9.0
+    # via ipython
+pillow==10.3.0
+    # via sentence-transformers
 platformdirs==4.2.1
     # via requests-cache
 priority==2.0.0
     # via hypercorn
+prompt-toolkit==3.0.46
+    # via ipython
 protobuf==5.26.1
-    # via de-quoi-parle-le-monde
+    # via media-observer
+ptyprocess==0.7.0
+    # via pexpect
+pure-eval==0.2.2
+    # via stack-data
 pydantic==2.7.1
     # via fastapi
 pydantic-core==2.18.2
     # via pydantic
 pygments==2.18.0
+    # via ipython
     # via rich
 pymongo==4.7.2
     # via motor
@@ -161,25 +238,41 @@ python-dotenv==1.0.1
 python-multipart==0.0.9
     # via fastapi
 pyyaml==6.0.1
+    # via huggingface-hub
+    # via transformers
     # via uvicorn
 redis==5.0.4
     # via aiohttp-client-cache
+regex==2024.5.15
+    # via transformers
 requests==2.31.0
-    # via de-quoi-parle-le-monde
+    # via huggingface-hub
+    # via media-observer
     # via requests-cache
+    # via transformers
 requests-cache==1.2.0
-    # via de-quoi-parle-le-monde
+    # via media-observer
 rich==13.7.1
     # via typer
 ruff==0.4.4
-    # via de-quoi-parle-le-monde
+    # via media-observer
 s3transfer==0.10.1
     # via boto3
+safetensors==0.4.3
+    # via transformers
+scikit-learn==1.5.0
+    # via sentence-transformers
+scipy==1.13.1
+    # via scikit-learn
+    # via sentence-transformers
+sentence-transformers==3.0.1
+    # via media-observer
 sentencepiece==0.2.0
-    # via de-quoi-parle-le-monde
+    # via media-observer
 shellingham==1.5.4
     # via typer
 six==1.16.0
+    # via asttokens
     # via python-dateutil
     # via url-normalize
 sniffio==1.3.1
@@ -187,15 +280,39 @@ sniffio==1.3.1
     # via httpx
 soupsieve==2.5
     # via beautifulsoup4
+stack-data==0.6.3
+    # via ipython
 starlette==0.37.2
     # via fastapi
+sympy==1.12.1
+    # via torch
+threadpoolctl==3.5.0
+    # via scikit-learn
+tokenizers==0.19.1
+    # via transformers
+torch==2.3.1
+    # via sentence-transformers
+tqdm==4.66.4
+    # via huggingface-hub
+    # via sentence-transformers
+    # via transformers
+traitlets==5.14.3
+    # via ipython
+    # via matplotlib-inline
+transformers==4.41.2
+    # via sentence-transformers
+triton==2.3.1
+    # via torch
 typer==0.12.3
     # via fastapi-cli
 typing-extensions==4.11.0
     # via aiosqlite
     # via fastapi
+    # via huggingface-hub
+    # via ipython
     # via pydantic
     # via pydantic-core
+    # via torch
     # via typer
 ujson==5.9.0
     # via fastapi
@@ -213,6 +330,8 @@ uvloop==0.19.0
     # via uvicorn
 watchfiles==0.21.0
     # via uvicorn
+wcwidth==0.2.13
+    # via prompt-toolkit
 websockets==12.0
     # via uvicorn
 wrapt==1.16.0
@@ -221,4 +340,4 @@ wsproto==1.2.0
     # via hypercorn
 yarl==1.9.4
     # via aiohttp
-    # via de-quoi-parle-le-monde
+    # via media-observer

+ 113 - 25
requirements.lock

@@ -3,7 +3,7 @@
 #
 # last locked with the following flags:
 #   pre: false
-#   features: []
+#   features: ["embeddings"]
 #   all-features: false
 #   with-sources: false
 
@@ -18,13 +18,13 @@ aiofiles==23.2.1
 aiohttp==3.9.5
     # via aiobotocore
     # via aiohttp-client-cache
-    # via de-quoi-parle-le-monde
+    # via media-observer
 aiohttp-client-cache==0.11.0
-    # via de-quoi-parle-le-monde
+    # via media-observer
 aioitertools==0.11.0
     # via aiobotocore
 aiolimiter==1.1.0
-    # via de-quoi-parle-le-monde
+    # via media-observer
 aiosignal==1.3.1
     # via aiohttp
 aiosqlite==0.20.0
@@ -32,7 +32,7 @@ aiosqlite==0.20.0
 annotated-types==0.6.0
     # via pydantic
 annoy==1.17.3
-    # via de-quoi-parle-le-monde
+    # via media-observer
 anyio==4.3.0
     # via httpx
     # via starlette
@@ -40,17 +40,17 @@ anyio==4.3.0
 async-timeout==4.0.3
     # via asyncpg
 asyncpg==0.29.0
-    # via de-quoi-parle-le-monde
+    # via media-observer
 attrs==23.2.0
     # via aiohttp
     # via aiohttp-client-cache
     # via cattrs
-    # via de-quoi-parle-le-monde
+    # via media-observer
     # via requests-cache
 babel==2.15.0
-    # via de-quoi-parle-le-monde
+    # via media-observer
 beautifulsoup4==4.12.3
-    # via de-quoi-parle-le-monde
+    # via media-observer
 boto3==1.34.69
     # via aiobotocore
 botocore==1.34.69
@@ -58,7 +58,7 @@ botocore==1.34.69
     # via boto3
     # via s3transfer
 cattrs==23.2.3
-    # via de-quoi-parle-le-monde
+    # via media-observer
     # via requests-cache
 certifi==2024.2.2
     # via httpcore
@@ -73,17 +73,25 @@ dnspython==2.6.1
     # via email-validator
     # via pymongo
 dynaconf==3.2.5
-    # via de-quoi-parle-le-monde
+    # via media-observer
 email-validator==2.1.1
     # via fastapi
 fastapi==0.111.0
-    # via de-quoi-parle-le-monde
     # via fastapi-cli
+    # via media-observer
 fastapi-cli==0.0.3
     # via fastapi
+filelock==3.14.0
+    # via huggingface-hub
+    # via torch
+    # via transformers
+    # via triton
 frozenlist==1.4.1
     # via aiohttp
     # via aiosignal
+fsspec==2024.6.0
+    # via huggingface-hub
+    # via torch
 h11==0.14.0
     # via httpcore
     # via hypercorn
@@ -99,10 +107,14 @@ httptools==0.6.1
     # via uvicorn
 httpx==0.27.0
     # via fastapi
+huggingface-hub==0.23.3
+    # via sentence-transformers
+    # via tokenizers
+    # via transformers
 humanize==4.9.0
-    # via de-quoi-parle-le-monde
+    # via media-observer
 hypercorn==0.16.0
-    # via de-quoi-parle-le-monde
+    # via media-observer
 hyperframe==6.0.1
     # via h2
 idna==3.7
@@ -114,15 +126,18 @@ idna==3.7
 itsdangerous==2.2.0
     # via aiohttp-client-cache
 jinja2==3.1.4
-    # via de-quoi-parle-le-monde
     # via fastapi
+    # via media-observer
+    # via torch
 jmespath==1.0.1
     # via boto3
     # via botocore
+joblib==1.4.2
+    # via scikit-learn
 loguru==0.7.2
-    # via de-quoi-parle-le-monde
+    # via media-observer
 lxml==5.2.2
-    # via de-quoi-parle-le-monde
+    # via media-observer
 markdown-it-py==3.0.0
     # via rich
 markupsafe==2.1.5
@@ -131,21 +146,61 @@ mdurl==0.1.2
     # via markdown-it-py
 motor==3.4.0
     # via aiohttp-client-cache
+mpmath==1.3.0
+    # via sympy
 multidict==6.0.5
     # via aiohttp
     # via yarl
+networkx==3.3
+    # via torch
 numpy==1.26.4
-    # via de-quoi-parle-le-monde
+    # via media-observer
+    # via scikit-learn
+    # via scipy
+    # via sentence-transformers
+    # via transformers
+nvidia-cublas-cu12==12.1.3.1
+    # via nvidia-cudnn-cu12
+    # via nvidia-cusolver-cu12
+    # via torch
+nvidia-cuda-cupti-cu12==12.1.105
+    # via torch
+nvidia-cuda-nvrtc-cu12==12.1.105
+    # via torch
+nvidia-cuda-runtime-cu12==12.1.105
+    # via torch
+nvidia-cudnn-cu12==8.9.2.26
+    # via torch
+nvidia-cufft-cu12==11.0.2.54
+    # via torch
+nvidia-curand-cu12==10.3.2.106
+    # via torch
+nvidia-cusolver-cu12==11.4.5.107
+    # via torch
+nvidia-cusparse-cu12==12.1.0.106
+    # via nvidia-cusolver-cu12
+    # via torch
+nvidia-nccl-cu12==2.20.5
+    # via torch
+nvidia-nvjitlink-cu12==12.5.40
+    # via nvidia-cusolver-cu12
+    # via nvidia-cusparse-cu12
+nvidia-nvtx-cu12==12.1.105
+    # via torch
 orjson==3.10.3
     # via fastapi
 packaging==24.0
-    # via de-quoi-parle-le-monde
+    # via huggingface-hub
+    # via media-observer
+    # via transformers
+pillow==10.3.0
+    # via sentence-transformers
 platformdirs==4.2.1
     # via requests-cache
 priority==2.0.0
     # via hypercorn
 protobuf==5.26.1
-    # via de-quoi-parle-le-monde
+    # via media-observer
 pydantic==2.7.1
     # via fastapi
 pydantic-core==2.18.2
@@ -161,22 +216,37 @@ python-dotenv==1.0.1
 python-multipart==0.0.9
     # via fastapi
 pyyaml==6.0.1
+    # via huggingface-hub
+    # via transformers
     # via uvicorn
 redis==5.0.4
     # via aiohttp-client-cache
+regex==2024.5.15
+    # via transformers
 requests==2.31.0
-    # via de-quoi-parle-le-monde
+    # via huggingface-hub
+    # via media-observer
     # via requests-cache
+    # via transformers
 requests-cache==1.2.0
-    # via de-quoi-parle-le-monde
+    # via media-observer
 rich==13.7.1
     # via typer
 ruff==0.4.4
-    # via de-quoi-parle-le-monde
+    # via media-observer
 s3transfer==0.10.1
     # via boto3
+safetensors==0.4.3
+    # via transformers
+scikit-learn==1.5.0
+    # via sentence-transformers
+scipy==1.13.1
+    # via scikit-learn
+    # via sentence-transformers
+sentence-transformers==3.0.1
+    # via media-observer
 sentencepiece==0.2.0
-    # via de-quoi-parle-le-monde
+    # via media-observer
 shellingham==1.5.4
     # via typer
 six==1.16.0
@@ -189,13 +259,31 @@ soupsieve==2.5
     # via beautifulsoup4
 starlette==0.37.2
     # via fastapi
+sympy==1.12.1
+    # via torch
+threadpoolctl==3.5.0
+    # via scikit-learn
+tokenizers==0.19.1
+    # via transformers
+torch==2.3.1
+    # via sentence-transformers
+tqdm==4.66.4
+    # via huggingface-hub
+    # via sentence-transformers
+    # via transformers
+transformers==4.41.2
+    # via sentence-transformers
+triton==2.3.1
+    # via torch
 typer==0.12.3
     # via fastapi-cli
 typing-extensions==4.11.0
     # via aiosqlite
     # via fastapi
+    # via huggingface-hub
     # via pydantic
     # via pydantic-core
+    # via torch
     # via typer
 ujson==5.9.0
     # via fastapi
@@ -221,4 +309,4 @@ wsproto==1.2.0
     # via hypercorn
 yarl==1.9.4
     # via aiohttp
-    # via de-quoi-parle-le-monde
+    # via media-observer

+ 4 - 12
src/media_observer/embeddings.py

@@ -26,21 +26,13 @@ def batched(iterable, n):
 
 @frozen
 class EmbeddingsJob:
-    article_id: int
+    title_id: int
     text: NDArray
 
     @staticmethod
     async def create(storage: Storage):
-        all_snapshots = await storage.list_all_featured_article_snapshots()
-        all_embeds_ids = set(
-            await storage.list_all_embedded_featured_article_snapshot_ids()
-        )
-
-        all_snapshots_not_stored = (
-            s for s in all_snapshots if s["id"] not in all_embeds_ids
-        )
-
-        return [EmbeddingsJob(s["id"], s["title"]) for s in all_snapshots_not_stored]
+        all_titles = await storage.list_all_titles_without_embedding()
+        return [EmbeddingsJob(t["id"], t["text"]) for t in all_titles]
 
 
 @frozen
@@ -73,7 +65,7 @@ class EmbeddingsWorker:
         batch_size = 64
         for batch in batched(jobs, batch_size):
             embeddings_by_id = self.compute_embeddings_for(
-                {j.article_id: j.text for j in batch}
+                {j.title_id: j.text for j in batch}
             )
             await self.store_embeddings(embeddings_by_id)
 

+ 17 - 19
src/media_observer/similarity_index.py

@@ -17,12 +17,12 @@ file_path_pickle_class = "./similarity.class"
 class SimilaritySearch:
     storage: Storage
     index: AnnoyIndex
-    embedding_to_featured: dict[int, int] = {}
-    featured_to_embedding: dict[int, int] = {}
+    index_id_to_title: dict[int, int] = {}
+    title_to_index_id: dict[int, int] = {}
     instance: ClassVar[Any | None] = None
 
     async def add_embeddings(self):
-        embeds = await self.storage.list_all_articles_embeddings()
+        embeds = await self.storage.list_all_embeddings()
         if not embeds:
             msg = (
                 "Did not find any embeddings in storage. "
@@ -31,41 +31,39 @@ class SimilaritySearch:
             logger.error(msg)
             raise ValueError(msg)
 
-        for e in embeds:
-            self.index.add_item(e["id"], e["title_embedding"])
-            self.embedding_to_featured[e["id"]] = e["featured_article_snapshot_id"]
-            self.featured_to_embedding[e["featured_article_snapshot_id"]] = e["id"]
+        for idx, e in enumerate(embeds):
+            self.index.add_item(idx, e["vector"])
+            self.title_to_index_id[e["title_id"]] = idx
+            self.index_id_to_title[idx] = e["title_id"]
 
         self.index.build(20)
 
     async def search(
         self,
-        featured_article_snapshot_ids: list[int],
+        title_ids: list[int],
         nb_results: int,
         score_func: Callable[[float], bool],
     ):
         try:
-            [embed_id] = [
-                self.featured_to_embedding[id_] for id_ in featured_article_snapshot_ids
-            ]
+            [title_id] = [self.title_to_index_id[id] for id in title_ids]
         except KeyError as e:
             msg = (
-                f"Could not find all embedding(s) in storage for {featured_article_snapshot_ids}. "
+                f"Could not find all embedding(s) in storage for {title_ids}. "
                 "A plausible cause is that they have not been computed yet"
             )
             logger.error(msg)
             raise e
 
         indices, distances = self.index.get_nns_by_item(
-            embed_id, nb_results, include_distances=True
+            title_id, nb_results, include_distances=True
         )
         return [
             (
-                embed_id,
+                title_id,
                 [
-                    (self.embedding_to_featured[i], d)
+                    (self.index_id_to_title[i], d)
                     for i, d in (zip(indices, distances))
-                    if i != embed_id and score_func(d)
+                    if i != title_id and score_func(d)
                 ],
             )
         ]
@@ -82,7 +80,7 @@ class SimilaritySearch:
     async def save(self):
         self.index.save(file_path_index)
         with open(file_path_pickle_class, "wb") as f:
-            pickle.dump((self.embedding_to_featured, self.featured_to_embedding), f)
+            pickle.dump((self.index_id_to_title, self.title_to_index_id), f)
 
     @classmethod
     def load(cls, storage):
@@ -92,10 +90,10 @@ class SimilaritySearch:
             try:
                 index.load(file_path_index)
                 with open(file_path_pickle_class, "rb") as f:
-                    (embedding_to_featured, featured_to_embedding) = pickle.load(f)
+                    (index_to_title, title_to_index) = pickle.load(f)
 
                 cls.instance = SimilaritySearch(
-                    storage, index, embedding_to_featured, featured_to_embedding
+                    storage, index, index_to_title, title_to_index
                 )
             except OSError:
                 logger.warning("Could not find index data")

+ 169 - 161
src/media_observer/storage.py

@@ -46,35 +46,35 @@ class Storage(StorageAbc):
             ],
         ),
         Table(
-            name="featured_articles",
+            name="articles",
             columns=[
                 Column(name="id", primary_key=True),
                 Column(name="url", type_="TEXT"),
             ],
         ),
         Table(
-            name="featured_article_snapshots",
+            name="titles",
             columns=[
                 Column(name="id", primary_key=True),
-                Column(
-                    name="featured_article_id",
-                    references="featured_articles (id) ON DELETE CASCADE",
-                ),
-                Column(name="title", type_="TEXT"),
-                Column(name="url", type_="TEXT"),
+                Column(name="text", type_="TEXT"),
             ],
         ),
         Table(
             name="main_articles",
             columns=[
                 Column(name="id", primary_key=True),
+                Column(name="url", type_="TEXT"),
                 Column(
                     name="snapshot_id",
                     references="snapshots (id) ON DELETE CASCADE",
                 ),
                 Column(
-                    name="featured_article_snapshot_id",
-                    references="featured_article_snapshots (id) ON DELETE CASCADE",
+                    name="article_id",
+                    references="articles (id) ON DELETE CASCADE",
+                ),
+                Column(
+                    name="title_id",
+                    references="titles (id) ON DELETE CASCADE",
                 ),
             ],
         ),
@@ -82,26 +82,28 @@ class Storage(StorageAbc):
             name="top_articles",
             columns=[
                 Column(name="id", primary_key=True),
+                Column(name="url", type_="TEXT"),
+                Column(name="rank", type_="INTEGER"),
                 Column(
                     name="snapshot_id",
                     references="snapshots (id) ON DELETE CASCADE",
                 ),
                 Column(
-                    name="featured_article_snapshot_id",
-                    references="featured_article_snapshots (id) ON DELETE CASCADE",
+                    name="article_id",
+                    references="articles (id) ON DELETE CASCADE",
+                ),
+                Column(
+                    name="title_id",
+                    references="titles (id) ON DELETE CASCADE",
                 ),
-                Column(name="rank", type_="INTEGER"),
             ],
         ),
         Table(
-            name="articles_embeddings",
+            name="embeddings",
             columns=[
                 Column(name="id", primary_key=True),
-                Column(
-                    name="featured_article_snapshot_id",
-                    references="featured_article_snapshots (id) ON DELETE CASCADE",
-                ),
-                Column(name="title_embedding", type_="bytea"),
+                Column(name="title_id", references="titles (id) ON DELETE CASCADE"),
+                Column(name="vector", type_="bytea"),
             ],
         ),
     ]
@@ -135,8 +137,8 @@ class Storage(StorageAbc):
             name="main_page_apparitions",
             column_names=[
                 "id",
-                "featured_article_id",
                 "title",
+                "title_id",
                 "url_archive",
                 "url_article",
                 "main_in_snapshot_id",
@@ -145,18 +147,32 @@ class Storage(StorageAbc):
             ],
             create_stmt="""
                 SELECT
-                    fas.id,
-                    fas.featured_article_id,
-                    fas.title,
-                    fas.url AS url_archive,
-                    fa.url AS url_article,
-                    m.snapshot_id AS main_in_snapshot_id,
-                    t.snapshot_id AS top_in_snapshot_id,
-                    t.rank
-                FROM featured_article_snapshots fas
-                JOIN featured_articles fa ON fa.id = fas.featured_article_id
-                LEFT JOIN main_articles m ON m.featured_article_snapshot_id = fas.id
-                LEFT JOIN top_articles t ON t.featured_article_snapshot_id = fas.id
+                    a.id,
+                    t.text AS title,
+                    t.id AS title_id,
+                    ma.url AS url_archive,
+                    a.url AS url_article,
+                    ma.snapshot_id AS main_in_snapshot_id,
+                    NULL AS top_in_snapshot_id,
+                    NULL AS rank
+                FROM articles a
+                JOIN main_articles ma ON ma.article_id = a.id
+                JOIN titles t ON t.id = ma.title_id
+
+                UNION ALL
+
+                SELECT
+                    a.id,
+                    t.text AS title,
+                    t.id AS title_id,
+                    ta.url AS url_archive,
+                    a.url AS url_article,
+                    NULL AS main_in_snapshot_id,
+                    ta.snapshot_id AS top_in_snapshot_id,
+                    ta.rank
+                FROM articles a
+                JOIN top_articles ta ON ta.article_id = a.id
+                JOIN titles t ON t.id = ta.title_id
                 """,
         ),
         View(
@@ -168,9 +184,9 @@ class Storage(StorageAbc):
                 "site_original_url",
                 "timestamp",
                 "timestamp_virtual",
-                "featured_article_snapshot_id",
-                "featured_article_id",
+                "article_id",
                 "title",
+                "title_id",
                 "url_archive",
                 "url_article",
                 "is_main",
@@ -178,22 +194,22 @@ class Storage(StorageAbc):
             ],
             create_stmt="""
                 SELECT
-                    sv.id as snapshot_id,
+                    sv.id AS snapshot_id,
                     sv.site_id,
                     sv.site_name,
                     sv.site_original_url,
-                    sv.timestamp,
+                    sv."timestamp",
                     sv.timestamp_virtual,
-                    mpa.id AS featured_article_snapshot_id,
-                    mpa.featured_article_id,
+                    mpa.id AS article_id,
                     mpa.title,
+                    mpa.title_id,
                     mpa.url_archive,
                     mpa.url_article,
                     mpa.main_in_snapshot_id IS NOT NULL AS is_main,
                     mpa.rank
                 FROM main_page_apparitions mpa
                 JOIN snapshots_view sv ON sv.id = mpa.main_in_snapshot_id OR sv.id = mpa.top_in_snapshot_id
-                """,
+            """,
         ),
     ]
 
@@ -209,29 +225,29 @@ class Storage(StorageAbc):
             columns=["timestamp_virtual", "site_id"],
         ),
         UniqueIndex(
-            name="main_articles_unique_idx_snapshot_id",
-            table="main_articles",
-            columns=["snapshot_id"],
+            name="articles_unique_url",
+            table="articles",
+            columns=["url"],
         ),
         UniqueIndex(
-            name="featured_articles_unique_url",
-            table="featured_articles",
-            columns=["url"],
+            name="titles_unique_text",
+            table="titles",
+            columns=["text"],
         ),
         UniqueIndex(
-            name="featured_article_snapshots_unique_idx_featured_article_id_url",
-            table="featured_article_snapshots",
-            columns=["featured_article_id", "url"],
+            name="main_articles_unique_idx_snapshot_id_article_id",
+            table="main_articles",
+            columns=["snapshot_id", "article_id"],
         ),
         UniqueIndex(
-            name="top_articles_unique_idx_snapshot_id_rank",
+            name="top_articles_unique_idx_snapshot_id_article_id_rank",
             table="top_articles",
-            columns=["snapshot_id", "rank"],
+            columns=["snapshot_id", "article_id", "rank"],
         ),
         UniqueIndex(
-            name="articles_embeddings_unique_idx_featured_article_snapshot_id",
-            table="articles_embeddings",
-            columns=["featured_article_snapshot_id"],
+            name="embeddings_unique_title_id",
+            table="embeddings",
+            columns=["title_id"],
         ),
     ]
 
@@ -288,91 +304,12 @@ class Storage(StorageAbc):
 
         return exists != []
 
-    async def list_all_featured_article_snapshots(self):
-        async with self.backend.get_connection() as conn:
-            rows = await conn.execute_fetchall(
-                """
-                    SELECT *
-                    FROM featured_article_snapshots
-                """,
-            )
-
-            return [
-                self._from_row(r, self._table_by_name["featured_article_snapshots"])
-                for r in rows
-            ]
-
-    async def list_snapshot_apparitions(self, featured_article_snapshot_ids: list[int]):
-        if len(featured_article_snapshot_ids) == 0:
-            return []
-
-        async with self.backend.get_connection() as conn:
-            rows = await conn.execute_fetchall(
-                f"""
-                    SELECT *
-                    FROM snapshot_apparitions
-                    WHERE featured_article_snapshot_id IN ({self._placeholders(*featured_article_snapshot_ids)})
-                """,
-                *featured_article_snapshot_ids,
-            )
-
-            return [
-                self._from_row(r, self._view_by_name["snapshot_apparitions"])
-                for r in rows
-            ]
-
     @classmethod
     def _from_row(cls, r, table_or_view: Table | View):
         columns = table_or_view.column_names
 
         return {col: r[idx] for idx, col in enumerate(columns)}
 
-    async def list_all_embedded_featured_article_snapshot_ids(self) -> list[int]:
-        async with self.backend.get_connection() as conn:
-            rows = await conn.execute_fetchall(
-                """
-                    SELECT featured_article_snapshot_id
-                    FROM articles_embeddings
-                """,
-            )
-
-            return [r[0] for r in rows]
-
-    async def list_all_articles_embeddings(self):
-        async with self.backend.get_connection() as conn:
-            rows = await conn.execute_fetchall(
-                """
-                    SELECT *
-                    FROM articles_embeddings
-                """,
-            )
-
-            return [self._from_articles_embeddings_row(r) for r in rows]
-
-    @classmethod
-    def _from_articles_embeddings_row(cls, r):
-        [embeds_table] = [t for t in cls.tables if t.name == "articles_embeddings"]
-        d = cls._from_row(r, embeds_table)
-        d.update(title_embedding=np.frombuffer(d["title_embedding"], dtype="float32"))
-
-        return d
-
-    async def add_embedding(self, featured_article_snapshot_id: int, embedding):
-        async with self.backend.get_connection() as conn:
-            await conn.execute_insert(
-                self._insert_stmt(
-                    "articles_embeddings",
-                    ["featured_article_snapshot_id", "title_embedding"],
-                ),
-                featured_article_snapshot_id,
-                embedding,
-            )
-
-    async def list_sites(self):
-        async with self.backend.get_connection() as conn:
-            sites = await conn.execute_fetchall("SELECT * FROM sites")
-            return [self._from_row(s, self._table_by_name["sites"]) for s in sites]
-
     async def list_neighbouring_main_articles(
         self,
         site_id: int,
@@ -431,6 +368,70 @@ class Storage(StorageAbc):
                 for a in main_articles
             ]
 
+    async def list_all_titles_without_embedding(self):
+        async with self.backend.get_connection() as conn:
+            rows = await conn.execute_fetchall("""
+                SELECT t.*
+                FROM public.titles AS t
+                WHERE NOT EXISTS (SELECT 1 FROM embeddings WHERE title_id = t.id)
+            """)
+
+            return [self._from_row(r, self._table_by_name["titles"]) for r in rows]
+
+    async def list_all_embeddings(self):
+        async with self.backend.get_connection() as conn:
+            rows = await conn.execute_fetchall(
+                """
+                    SELECT *
+                    FROM embeddings
+                """,
+            )
+
+            return [self._from_embeddings_row(r) for r in rows]
+
+    async def list_snapshot_apparitions(self, title_ids: list[int]):
+        if len(title_ids) == 0:
+            return []
+
+        async with self.backend.get_connection() as conn:
+            rows = await conn.execute_fetchall(
+                f"""
+                    SELECT *
+                    FROM snapshot_apparitions
+                    WHERE title_id IN ({self._placeholders(*title_ids)})
+                """,
+                *title_ids,
+            )
+
+            return [
+                self._from_row(r, self._view_by_name["snapshot_apparitions"])
+                for r in rows
+            ]
+
+    @classmethod
+    def _from_embeddings_row(cls, r):
+        [embeds_table] = [t for t in cls.tables if t.name == "embeddings"]
+        d = cls._from_row(r, embeds_table)
+        d.update(vector=np.frombuffer(d["vector"], dtype="float32"))
+
+        return d
+
+    async def add_embedding(self, title_id: int, embedding):
+        async with self.backend.get_connection() as conn:
+            await conn.execute_insert(
+                self._insert_stmt(
+                    "embeddings",
+                    ["title_id", "vector"],
+                ),
+                title_id,
+                embedding,
+            )
+
+    async def list_sites(self):
+        async with self.backend.get_connection() as conn:
+            sites = await conn.execute_fetchall("SELECT * FROM sites")
+            return [self._from_row(s, self._table_by_name["sites"]) for s in sites]
+
     async def add_page(self, collection, page, dt):
         assert dt.tzinfo is not None
 
@@ -440,23 +441,23 @@ class Storage(StorageAbc):
                 snapshot_id = await self._add_snapshot(
                     conn, site_id, page.snapshot.id, dt
                 )
-                article_id = await self._add_featured_article(
+                article_id = await self._add_article(
                     conn, page.main_article.article.original
                 )
-                main_article_snap_id = await self._add_featured_article_snapshot(
-                    conn, article_id, page.main_article.article
+                title_id = await self._add_title(conn, page.main_article.article.title)
+                await self._add_main_article(
+                    conn,
+                    snapshot_id,
+                    article_id,
+                    title_id,
+                    page.main_article.article.url,
                 )
-                await self._add_main_article(conn, snapshot_id, main_article_snap_id)
 
                 for t in page.top_articles:
-                    article_id = await self._add_featured_article(
-                        conn, t.article.original
-                    )
-                    top_article_snap_id = await self._add_featured_article_snapshot(
-                        conn, article_id, t.article
-                    )
+                    article_id = await self._add_article(conn, t.article.original)
+                    title_id = await self._add_title(conn, t.article.title)
                     await self._add_top_article(
-                        conn, snapshot_id, top_article_snap_id, t
+                        conn, snapshot_id, article_id, title_id, t.article.url, t.rank
                     )
 
         return site_id
@@ -490,49 +491,56 @@ class Storage(StorageAbc):
             [virtual, site_id],
         )
 
-    async def _add_featured_article(self, conn, article: FeaturedArticle):
+    async def _add_article(self, conn, article: FeaturedArticle):
         return await self._insert_or_get(
             conn,
-            self._insert_stmt("featured_articles", ["url"]),
+            self._insert_stmt("articles", ["url"]),
             [str(article.url)],
-            "SELECT id FROM featured_articles WHERE url = $1",
+            "SELECT id FROM articles WHERE url = $1",
             [str(article.url)],
         )
 
-    async def _add_featured_article_snapshot(
-        self, conn, featured_article_id: int, article: FeaturedArticleSnapshot
-    ):
+    async def _add_title(self, conn, title: str):
         return await self._insert_or_get(
             conn,
-            self._insert_stmt(
-                "featured_article_snapshots",
-                ["title", "url", "featured_article_id"],
-            ),
-            [article.title, str(article.url), featured_article_id],
-            "SELECT id FROM featured_article_snapshots WHERE featured_article_id = $1 AND url = $2",
-            [featured_article_id, str(article.url)],
+            self._insert_stmt("titles", ["text"]),
+            [title],
+            "SELECT id FROM titles WHERE text = $1",
+            [title],
         )
 
-    async def _add_main_article(self, conn, snapshot_id: int, article_id: int):
+    async def _add_main_article(
+        self, conn, snapshot_id: int, article_id: int, title_id: int, url: str
+    ):
         await conn.execute_insert(
             self._insert_stmt(
-                "main_articles", ["snapshot_id", "featured_article_snapshot_id"]
+                "main_articles", ["snapshot_id", "article_id", "title_id", "url"]
             ),
             snapshot_id,
             article_id,
+            title_id,
+            str(url),
         )
 
     async def _add_top_article(
-        self, conn, snapshot_id: int, article_id: int, article: TopArticle
+        self,
+        conn,
+        snapshot_id: int,
+        article_id: int,
+        title_id: int,
+        url: str,
+        rank: int,
     ):
         await conn.execute_insert(
             self._insert_stmt(
                 "top_articles",
-                ["snapshot_id", "featured_article_snapshot_id", "rank"],
+                ["snapshot_id", "article_id", "title_id", "url", "rank"],
             ),
             snapshot_id,
             article_id,
-            article.rank,
+            title_id,
+            str(url),
+            rank,
         )
 
     async def _insert_or_get(

+ 2 - 2
src/media_observer/storage_abstraction.py

@@ -86,7 +86,7 @@ class StorageAbc(ABC):
     async def list_all_featured_article_snapshots(self):
         raise NotImplementedError()
 
-    async def list_snapshot_apparitions(self, featured_article_snapshot_ids: list[int]):
+    async def list_snapshot_apparitions(self, title_ids: list[int]):
         raise NotImplementedError()
 
     async def list_all_embedded_featured_article_snapshot_ids(self) -> list[int]:
@@ -95,7 +95,7 @@ class StorageAbc(ABC):
     async def list_all_articles_embeddings(self):
         raise NotImplementedError()
 
-    async def add_embedding(self, featured_article_snapshot_id: int, embedding):
+    async def add_embedding(self, title_id: int, embedding):
         raise NotImplementedError()
 
     async def list_sites(self):

+ 5 - 4
src/media_observer/web.py

@@ -105,6 +105,7 @@ async def site_main_article_snapshot(
         )
 
     main_articles = await storage.list_neighbouring_main_articles(id, timestamp)
+
     [focused_article] = [
         a for a in main_articles if a["site_id"] == id and a["time_diff"] == 0
     ]
@@ -116,10 +117,10 @@ async def site_main_article_snapshot(
         a for a in main_articles if a["site_id"] == id and a["time_diff"] != 0
     ]
 
-    focused_article_id = focused_article["featured_article_snapshot_id"]
+    focused_title_id = focused_article["title_id"]
     try:
         [(_, similar)] = await sim_index.search(
-            [focused_article_id],
+            [focused_title_id],
             20,
             lambda s: s < 100 and s >= 25,
         )
@@ -133,9 +134,9 @@ async def site_main_article_snapshot(
     # A list of articles and score, sorted by descending score
     similar_articles_and_score = sorted(
         [
-            (a, similar_by_id[a["featured_article_snapshot_id"]])
+            (a, similar_by_id[a["title_id"]])
             for a in similar_articles
-            if a["featured_article_snapshot_id"] != focused_article_id
+            if a["title_id"] != focused_title_id
         ],
         key=lambda a: a[1],
         reverse=True,