Skip to content

Commit 0119431

Browse files
authored
Merge pull request #82 from openzim/remove_dedup
Reduce number of aliases for tiles to a strict minimum
2 parents ae3866d + 517c2ef commit 0119431

2 files changed

Lines changed: 34 additions & 37 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2424
- Fix bad favicon paths (#68)
2525
- Replace axios with fetch API and move config.json to a relative URL (#75)
2626
- Create ZIM alias instead of redirects for tiles (#53)
27+
- Reduce number of aliases for tiles to a strict minimum (#78)
2728

2829
## [0.1.1] - 2026-03-10
2930

scraper/src/maps2zim/processor.py

Lines changed: 33 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from typing import Any
1515
from urllib.parse import urlparse
1616

17-
from libzim.writer import Hint
17+
from libzim.writer import Hint # pyright: ignore[reportMissingModuleSource]
1818
from pydantic import BaseModel
1919
from schedule import every, run_pending
2020
from zimscraperlib.download import save_large_file
@@ -911,22 +911,22 @@ def _write_tiles_to_zim(
911911
):
912912
"""Write all tiles and tile deduplication files in a single pass.
913913
914-
Iterates through tiles_shallow, writing each unique dedup tile data
915-
to ZIM and creating redirects from tile paths to dedup paths.
914+
Iterates through tiles_shallow, writing each unique tile data to ZIM and
915+
creating aliases when multiple tiles share the same underlying data.
916916
917917
Args:
918918
creator: ZIM creator object
919919
tile_filter: Optional TileFilter for geographic filtering
920920
total_tile_count: Total number of tiles in tiles_shallow
921921
"""
922-
logger.info(" Processing tiles and dedup files")
922+
logger.info(" Processing tiles")
923923

924924
mbtiles_path = context.dl_folder / f"{context.area}.mbtiles"
925925
conn = sqlite3.connect(mbtiles_path)
926926
c = conn.cursor()
927927

928928
try:
929-
written_dedup_ids: set[int] = set()
929+
tile_data_id_to_path: dict[int, str] = {}
930930
written_tiles: int = 0
931931
last_log_time = time.time()
932932

@@ -939,7 +939,7 @@ def _write_tiles_to_zim(
939939
z = row[0]
940940
x = row[1]
941941
y = self._flip_y(z, row[2])
942-
dedup_id = row[3]
942+
tile_data_id = row[3]
943943

944944
# Update progress (at the beginning for adequate values)
945945
self.stats_items_done += 1
@@ -950,22 +950,23 @@ def _write_tiles_to_zim(
950950
# Log progress if more than 1 minute since last log
951951
continue
952952

953-
# Construct paths
953+
# Construct path
954954
tile_path = f"tiles/{z}/{x}/{y}.pbf"
955-
dedupl_path = f"dedupl/{self._dedupl_helper_path(dedup_id)}"
956955

957-
# Write dedup file if this is the first time we see this dedup_id
958-
if dedup_id not in written_dedup_ids:
959-
written_dedup_ids.add(dedup_id)
956+
# Write tile data if this is the first time we see this tile_data_id
957+
if tile_data_id not in tile_data_id_to_path:
958+
tile_data_id_to_path[tile_data_id] = tile_path
960959

961-
# Fetch tile data for this dedup_id
960+
# Fetch tile data for this tile_data_id
962961
row_data = conn.execute(
963962
"select tile_data from tiles_data where tile_data_id = ?",
964-
(dedup_id,),
963+
(tile_data_id,),
965964
).fetchone()
966965

967966
if not row_data:
968-
raise ValueError(f"Tile data not found for dedup_id={dedup_id}")
967+
raise ValueError(
968+
f"Tile data not found for tile_data_id={tile_data_id}"
969+
)
969970

970971
tile_data = row_data[0]
971972

@@ -976,18 +977,24 @@ def _write_tiles_to_zim(
976977
# If decompression fails, assume data is already uncompressed
977978
pass
978979

979-
# Add dedup file to ZIM
980+
# Add real data to ZIM
980981
creator.add_item_for(
981-
path=f"dedupl/{self._dedupl_helper_path(dedup_id)}",
982+
path=tile_path,
982983
content=tile_data,
983984
mimetype="application/x-protobuf",
984985
should_compress=True,
986+
is_front=False,
987+
)
988+
989+
else:
990+
# Create alias from new tile to original tile already written
991+
creator.add_alias(
992+
tile_path,
993+
"",
994+
tile_data_id_to_path[tile_data_id],
995+
hints={Hint.FRONT_ARTICLE: False},
985996
)
986997

987-
# Create alias from tile to dedupl
988-
creator.add_alias(
989-
tile_path, "", dedupl_path, hints={Hint.FRONT_ARTICLE: False}
990-
)
991998
written_tiles += 1
992999

9931000
# Log progress every LOG_EVERY_SECONDS
@@ -996,15 +1003,17 @@ def _write_tiles_to_zim(
9961003
logger.info(
9971004
f" Processed {i}/{total_tile_count} tiles "
9981005
f"({i / total_tile_count * 100:.1f}% processed: "
999-
f"{written_tiles} tiles and {len(written_dedup_ids)} "
1000-
"unique dedup written)"
1006+
f"{written_tiles} tiles written in the ZIM "
1007+
f"({len(tile_data_id_to_path)} real data, "
1008+
f"{written_tiles - len(tile_data_id_to_path)} aliases)"
10011009
)
10021010
last_log_time = current_time
10031011

10041012
logger.info(
10051013
f" Processing complete: {total_tile_count} tiles processed, "
1006-
f"{written_tiles} tiles and {len(written_dedup_ids)} unique dedup "
1007-
"written in the ZIM"
1014+
f"{written_tiles} tiles written in the ZIM "
1015+
f"({len(tile_data_id_to_path)} real data, "
1016+
f"{written_tiles - len(tile_data_id_to_path)} aliases)"
10081017
)
10091018

10101019
finally:
@@ -1099,19 +1108,6 @@ def _fetch_mbtiles(self):
10991108
)
11001109
logger.info(f" mbtiles file saved to {mbtiles_path}")
11011110

1102-
@staticmethod
1103-
def _dedupl_helper_path(dedupl_id: int) -> str:
1104-
"""Calculate dedupl path for a given ID.
1105-
1106-
Organizes IDs into a 3-level directory structure to keep max
1107-
1000 items per directory, allowing for 1 billion files.
1108-
"""
1109-
str_num = f"{dedupl_id:09d}"
1110-
l1 = str_num[:3]
1111-
l2 = str_num[3:6]
1112-
l3 = str_num[6:]
1113-
return f"{l1}/{l2}/{l3}.pbf"
1114-
11151111
@staticmethod
11161112
def _flip_y(zoom: int, y: int) -> int:
11171113
"""Flip Y coordinate for tile indexing.

0 commit comments

Comments
 (0)