Browse Source

[fix] URLs now use the virtual_timestamp instead of an ID

The virtual_timestamp was assumed to be unique per snapshot, but that is not
the case.
jherve 1 năm trước cách đây
mục cha
commit
70d9705643

+ 23 - 1
README.md

@@ -16,4 +16,26 @@ SELECT * FROM (
 WHERE count > 1
 ```
 
-Among other things it leads to "deadends" while browsing the UI, likely because the timestamp search and time diff relies on this false assumption.
+Among other things, it leads to "dead ends" while browsing the UI, likely because the timestamp search and time diff rely on this false assumption.
+
+2024-05-23 : This is likely not relevant anymore now that the URLs include the timestamp and not the snapshot_id.
+
+### Different virtual timestamp, same timestamp
+
+The snapshot process ends up choosing the same snapshot for different virtual timestamps.
+
+This can be checked with this query :
+
+```sql
+SELECT
+    sv.id, sv.site_id, sv2.id, sv2.site_id, sv.timestamp_virtual, sv2.timestamp_virtual, sv2.timestamp
+FROM snapshots_view sv
+CROSS JOIN snapshots_view sv2
+WHERE
+    sv.id != sv2.id
+    and sv.timestamp = sv2.timestamp
+```
+
+### Web archive URL
+
+In "Le Parisien" article snapshots, the archive URL is incorrect (note that it starts with `//`, without a scheme), e.g.: //web.archive.org/web/20240522165852/https://www.leparisien.fr/jo-paris-2024/jo-2024-jusqua-1-900-euros-pour-les-cheminots-22-05-2024-SPE5SAOSZVAL3LE3KCMHP2ZBDE.php

+ 11 - 20
src/de_quoi_parle_le_monde/storage.py

@@ -436,29 +436,21 @@ class Storage:
     async def list_neighbouring_main_articles(
         self,
         site_id: int,
-        featured_article_snapshot_id: int | None = None,
+        timestamp: datetime | None = None,
     ):
         async with self.backend.get_connection() as conn:
-            if featured_article_snapshot_id is None:
-                timestamp_query, timestamp_params = (
+            if timestamp is None:
+                [row] = await conn.execute_fetchall(
                     """
                     SELECT timestamp_virtual
-                    FROM snapshot_apparitions sav
-                    WHERE is_main AND site_id = $1
+                    FROM snapshots_view
+                    WHERE site_id = $1
                     ORDER BY timestamp_virtual DESC
                     LIMIT 1
                     """,
-                    [site_id],
-                )
-            else:
-                timestamp_query, timestamp_params = (
-                    """
-                    SELECT timestamp_virtual
-                    FROM snapshot_apparitions sav
-                    WHERE is_main AND site_id = $1 AND featured_article_snapshot_id = $2
-                    """,
-                    [site_id, featured_article_snapshot_id],
+                    site_id,
                 )
+                timestamp = row["timestamp_virtual"]
 
             # This query is the union of 3 queries that respectively fetch :
             #   * articles published at the same time as the queried article (including the queried article)
@@ -466,10 +458,8 @@ class Storage:
             #   * the article published just before, on the same site
             main_articles = await conn.execute_fetchall(
                 f"""
-                WITH original_timestamp AS (
-                    {timestamp_query}
-                ), sav_diff AS (
-                    SELECT sav.*, EXTRACT(EPOCH FROM sav.timestamp_virtual - (SELECT * FROM original_timestamp)) :: integer AS time_diff
+                WITH sav_diff AS (
+                    SELECT sav.*, EXTRACT(EPOCH FROM sav.timestamp_virtual - $2) :: integer AS time_diff
                     FROM snapshot_apparitions sav
                 )
                 SELECT * FROM (
@@ -491,7 +481,8 @@ class Storage:
                     LIMIT 1
                 )
                 """,
-                *(timestamp_params),
+                site_id,
+                timestamp,
             )
 
             return [

+ 5 - 5
src/de_quoi_parle_le_monde/web.py

@@ -1,4 +1,4 @@
-from datetime import timedelta
+from datetime import datetime, timedelta
 from fastapi import FastAPI, Request, Depends
 from fastapi.responses import HTMLResponse
 from fastapi.staticfiles import StaticFiles
@@ -78,12 +78,12 @@ async def index(request: Request, storage: Storage = Depends(get_db)):
     )
 
 
-@app.get("/sites/{id}/main_article", response_class=HTMLResponse)
-@app.get("/sites/{id}/main_article/{snapshot_id}", response_class=HTMLResponse)
+@app.get("/t/sites/{id}/main_article", response_class=HTMLResponse)
+@app.get("/t/sites/{id}/main_article/{timestamp}", response_class=HTMLResponse)
 async def site_main_article_snapshot(
     request: Request,
     id: int,
-    snapshot_id: int | None = None,
+    timestamp: datetime | None = None,
     storage: Storage = Depends(get_db),
     sim_index: SimilaritySearch = Depends(get_similarity_search),
 ):
@@ -94,7 +94,7 @@ async def site_main_article_snapshot(
             default=None,
         )
 
-    main_articles = await storage.list_neighbouring_main_articles(id, snapshot_id)
+    main_articles = await storage.list_neighbouring_main_articles(id, timestamp)
     [focused_article] = [
         a for a in main_articles if a["site_id"] == id and a["time_diff"] == 0
     ]

+ 1 - 1
templates/site_main_article_detail.html

@@ -8,7 +8,7 @@
 </head>
 <body>
     {% macro article(a) -%}
-        <a href="{{ url_for('site_main_article_snapshot', id=a['site_id'], snapshot_id=a['featured_article_snapshot_id']) }}">
+        <a href="{{ url_for('site_main_article_snapshot', id=a['site_id'], timestamp=a['timestamp_virtual']) }}">
             {{ ui.logo(a["site_name"]) }} {{ a["title"] }}
         </a>
     {%- endmacro %}