1414from typing import Any
1515from urllib .parse import urlparse
1616
17- from libzim .writer import Hint
17+ from libzim .writer import Hint # pyright: ignore[reportMissingModuleSource]
1818from pydantic import BaseModel
1919from schedule import every , run_pending
2020from zimscraperlib .download import save_large_file
@@ -911,22 +911,22 @@ def _write_tiles_to_zim(
911911 ):
912912 """Write all tiles and tile deduplication files in a single pass.
913913
914- Iterates through tiles_shallow, writing each unique dedup tile data
915- to ZIM and creating redirects from tile paths to dedup paths .
914+ Iterates through tiles_shallow, writing each unique tile data to ZIM and
915+ creating aliases when multiple tiles share the same underlying data .
916916
917917 Args:
918918 creator: ZIM creator object
919919 tile_filter: Optional TileFilter for geographic filtering
920920 total_tile_count: Total number of tiles in tiles_shallow
921921 """
922- logger .info (" Processing tiles and dedup files " )
922+ logger .info (" Processing tiles" )
923923
924924 mbtiles_path = context .dl_folder / f"{ context .area } .mbtiles"
925925 conn = sqlite3 .connect (mbtiles_path )
926926 c = conn .cursor ()
927927
928928 try :
929- written_dedup_ids : set [int ] = set ()
929+ tile_data_id_to_path : dict [int , str ] = {}
930930 written_tiles : int = 0
931931 last_log_time = time .time ()
932932
@@ -939,7 +939,7 @@ def _write_tiles_to_zim(
939939 z = row [0 ]
940940 x = row [1 ]
941941 y = self ._flip_y (z , row [2 ])
942- dedup_id = row [3 ]
942+ tile_data_id = row [3 ]
943943
944944 # Update progress (at the beginning for adequate values)
945945 self .stats_items_done += 1
@@ -950,22 +950,23 @@ def _write_tiles_to_zim(
950950 # Log progress if more than 1 minute since last log
951951 continue
952952
953- # Construct paths
953+ # Construct path
954954 tile_path = f"tiles/{ z } /{ x } /{ y } .pbf"
955- dedupl_path = f"dedupl/{ self ._dedupl_helper_path (dedup_id )} "
956955
957- # Write dedup file if this is the first time we see this dedup_id
958- if dedup_id not in written_dedup_ids :
959- written_dedup_ids . add ( dedup_id )
956+ # Write the real tile data if this is the first time we see this tile_data_id
957+ if tile_data_id not in tile_data_id_to_path :
958+ tile_data_id_to_path [ tile_data_id ] = tile_path
960959
961- # Fetch tile data for this dedup_id
960+ # Fetch tile data for this tile_data_id
962961 row_data = conn .execute (
963962 "select tile_data from tiles_data where tile_data_id = ?" ,
964- (dedup_id ,),
963+ (tile_data_id ,),
965964 ).fetchone ()
966965
967966 if not row_data :
968- raise ValueError (f"Tile data not found for dedup_id={ dedup_id } " )
967+ raise ValueError (
968+ f"Tile data not found for tile_data_id={ tile_data_id } "
969+ )
969970
970971 tile_data = row_data [0 ]
971972
@@ -976,18 +977,24 @@ def _write_tiles_to_zim(
976977 # If decompression fails, assume data is already uncompressed
977978 pass
978979
979- # Add dedup file to ZIM
980+ # Add real data to ZIM
980981 creator .add_item_for (
981- path = f"dedupl/ { self . _dedupl_helper_path ( dedup_id ) } " ,
982+ path = tile_path ,
982983 content = tile_data ,
983984 mimetype = "application/x-protobuf" ,
984985 should_compress = True ,
986+ is_front = False ,
987+ )
988+
989+ else :
990+ # Create alias from new tile to original tile already written
991+ creator .add_alias (
992+ tile_path ,
993+ "" ,
994+ tile_data_id_to_path [tile_data_id ],
995+ hints = {Hint .FRONT_ARTICLE : False },
985996 )
986997
987- # Create alias from tile to dedupl
988- creator .add_alias (
989- tile_path , "" , dedupl_path , hints = {Hint .FRONT_ARTICLE : False }
990- )
991998 written_tiles += 1
992999
9931000 # Log progress every LOG_EVERY_SECONDS
@@ -996,15 +1003,17 @@ def _write_tiles_to_zim(
9961003 logger .info (
9971004 f" Processed { i } /{ total_tile_count } tiles "
9981005 f"({ i / total_tile_count * 100 :.1f} % processed: "
999- f"{ written_tiles } tiles and { len (written_dedup_ids )} "
1000- "unique dedup written)"
1006+ f"{ written_tiles } tiles written in the ZIM "
1007+ f"({ len (tile_data_id_to_path )} real data, "
1008+ f"{ written_tiles - len (tile_data_id_to_path )} aliases)"
10011009 )
10021010 last_log_time = current_time
10031011
10041012 logger .info (
10051013 f" Processing complete: { total_tile_count } tiles processed, "
1006- f"{ written_tiles } tiles and { len (written_dedup_ids )} unique dedup "
1007- "written in the ZIM"
1014+ f"{ written_tiles } tiles written in the ZIM "
1015+ f"({ len (tile_data_id_to_path )} real data, "
1016+ f"{ written_tiles - len (tile_data_id_to_path )} aliases)"
10081017 )
10091018
10101019 finally :
@@ -1099,19 +1108,6 @@ def _fetch_mbtiles(self):
10991108 )
11001109 logger .info (f" mbtiles file saved to { mbtiles_path } " )
11011110
1102- @staticmethod
1103- def _dedupl_helper_path (dedupl_id : int ) -> str :
1104- """Calculate dedupl path for a given ID.
1105-
1106- Organizes IDs into a 3-level directory structure to keep max
1107- 1000 items per directory, allowing for 1 billion files.
1108- """
1109- str_num = f"{ dedupl_id :09d} "
1110- l1 = str_num [:3 ]
1111- l2 = str_num [3 :6 ]
1112- l3 = str_num [6 :]
1113- return f"{ l1 } /{ l2 } /{ l3 } .pbf"
1114-
11151111 @staticmethod
11161112 def _flip_y (zoom : int , y : int ) -> int :
11171113 """Flip Y coordinate for tile indexing.
0 commit comments