Przeglądaj źródła

[fix] URLs now use the virtual_timestamp instead of an ID

The virtual_timestamp was assumed to be unique per snapshot, but that is not
the case.
jherve 1 rok temu
rodzic
commit
70d9705643

+ 23 - 1
README.md

@@ -16,4 +16,26 @@ SELECT * FROM (
 WHERE count > 1
 WHERE count > 1
 ```
 ```
 
 
-Among other things it leads to "deadends" while browsing the UI, likely because the timestamp search and time diff relies on this false assumption.
+Among other things, it leads to "dead ends" while browsing the UI, likely because the timestamp search and time diff rely on this false assumption.
+
+2024-05-23: This is likely no longer relevant now that the URLs include the timestamp rather than the snapshot_id.
+
+### Different virtual timestamp, same timestamp
+
+The snapshot process ends up choosing the same snapshot for different virtual timestamps.
+
+This can be checked with the following query:
+
+```sql
+SELECT
+    sv.id, sv.site_id, sv2.id, sv2.site_id, sv.timestamp_virtual, sv2.timestamp_virtual, sv2.timestamp
+FROM snapshots_view sv
+CROSS JOIN snapshots_view sv2
+WHERE
+    sv.id != sv2.id
+    and sv.timestamp = sv2.timestamp
+```
+
+### Web archive URL
+
+In "Le Parisien" article snapshots, the archive URL is incorrect, e.g.: //web.archive.org/web/20240522165852/https://www.leparisien.fr/jo-paris-2024/jo-2024-jusqua-1-900-euros-pour-les-cheminots-22-05-2024-SPE5SAOSZVAL3LE3KCMHP2ZBDE.php

+ 11 - 20
src/de_quoi_parle_le_monde/storage.py

@@ -436,29 +436,21 @@ class Storage:
     async def list_neighbouring_main_articles(
     async def list_neighbouring_main_articles(
         self,
         self,
         site_id: int,
         site_id: int,
-        featured_article_snapshot_id: int | None = None,
+        timestamp: datetime | None = None,
     ):
     ):
         async with self.backend.get_connection() as conn:
         async with self.backend.get_connection() as conn:
-            if featured_article_snapshot_id is None:
-                timestamp_query, timestamp_params = (
+            if timestamp is None:
+                [row] = await conn.execute_fetchall(
                     """
                     """
                     SELECT timestamp_virtual
                     SELECT timestamp_virtual
-                    FROM snapshot_apparitions sav
-                    WHERE is_main AND site_id = $1
+                    FROM snapshots_view
+                    WHERE site_id = $1
                     ORDER BY timestamp_virtual DESC
                     ORDER BY timestamp_virtual DESC
                     LIMIT 1
                     LIMIT 1
                     """,
                     """,
-                    [site_id],
-                )
-            else:
-                timestamp_query, timestamp_params = (
-                    """
-                    SELECT timestamp_virtual
-                    FROM snapshot_apparitions sav
-                    WHERE is_main AND site_id = $1 AND featured_article_snapshot_id = $2
-                    """,
-                    [site_id, featured_article_snapshot_id],
+                    site_id,
                 )
                 )
+                timestamp = row["timestamp_virtual"]
 
 
             # This query is the union of 3 queries that respectively fetch :
             # This query is the union of 3 queries that respectively fetch :
             #   * articles published at the same time as the queried article (including the queried article)
             #   * articles published at the same time as the queried article (including the queried article)
@@ -466,10 +458,8 @@ class Storage:
             #   *the article published just before, on the same site
             #   *the article published just before, on the same site
             main_articles = await conn.execute_fetchall(
             main_articles = await conn.execute_fetchall(
                 f"""
                 f"""
-                WITH original_timestamp AS (
-                    {timestamp_query}
-                ), sav_diff AS (
-                    SELECT sav.*, EXTRACT(EPOCH FROM sav.timestamp_virtual - (SELECT * FROM original_timestamp)) :: integer AS time_diff
+                WITH sav_diff AS (
+                    SELECT sav.*, EXTRACT(EPOCH FROM sav.timestamp_virtual - $2) :: integer AS time_diff
                     FROM snapshot_apparitions sav
                     FROM snapshot_apparitions sav
                 )
                 )
                 SELECT * FROM (
                 SELECT * FROM (
@@ -491,7 +481,8 @@ class Storage:
                     LIMIT 1
                     LIMIT 1
                 )
                 )
                 """,
                 """,
-                *(timestamp_params),
+                site_id,
+                timestamp,
             )
             )
 
 
             return [
             return [

+ 5 - 5
src/de_quoi_parle_le_monde/web.py

@@ -1,4 +1,4 @@
-from datetime import timedelta
+from datetime import datetime, timedelta
 from fastapi import FastAPI, Request, Depends
 from fastapi import FastAPI, Request, Depends
 from fastapi.responses import HTMLResponse
 from fastapi.responses import HTMLResponse
 from fastapi.staticfiles import StaticFiles
 from fastapi.staticfiles import StaticFiles
@@ -78,12 +78,12 @@ async def index(request: Request, storage: Storage = Depends(get_db)):
     )
     )
 
 
 
 
-@app.get("/sites/{id}/main_article", response_class=HTMLResponse)
-@app.get("/sites/{id}/main_article/{snapshot_id}", response_class=HTMLResponse)
+@app.get("/t/sites/{id}/main_article", response_class=HTMLResponse)
+@app.get("/t/sites/{id}/main_article/{timestamp}", response_class=HTMLResponse)
 async def site_main_article_snapshot(
 async def site_main_article_snapshot(
     request: Request,
     request: Request,
     id: int,
     id: int,
-    snapshot_id: int | None = None,
+    timestamp: datetime | None = None,
     storage: Storage = Depends(get_db),
     storage: Storage = Depends(get_db),
     sim_index: SimilaritySearch = Depends(get_similarity_search),
     sim_index: SimilaritySearch = Depends(get_similarity_search),
 ):
 ):
@@ -94,7 +94,7 @@ async def site_main_article_snapshot(
             default=None,
             default=None,
         )
         )
 
 
-    main_articles = await storage.list_neighbouring_main_articles(id, snapshot_id)
+    main_articles = await storage.list_neighbouring_main_articles(id, timestamp)
     [focused_article] = [
     [focused_article] = [
         a for a in main_articles if a["site_id"] == id and a["time_diff"] == 0
         a for a in main_articles if a["site_id"] == id and a["time_diff"] == 0
     ]
     ]

+ 1 - 1
templates/site_main_article_detail.html

@@ -8,7 +8,7 @@
 </head>
 </head>
 <body>
 <body>
     {% macro article(a) -%}
     {% macro article(a) -%}
-        <a href="{{ url_for('site_main_article_snapshot', id=a['site_id'], snapshot_id=a['featured_article_snapshot_id']) }}">
+        <a href="{{ url_for('site_main_article_snapshot', id=a['site_id'], timestamp=a['timestamp_virtual']) }}">
             {{ ui.logo(a["site_name"]) }} {{ a["title"] }}
             {{ ui.logo(a["site_name"]) }} {{ a["title"] }}
         </a>
         </a>
     {%- endmacro %}
     {%- endmacro %}