2 changes: 1 addition & 1 deletion README
@@ -28,7 +28,7 @@ Python modules:
opencc (https://pypi.org/project/OpenCC/)
pypinyin (https://pypi.org/project/pypinyin/)
regex (https://pypi.org/project/regex/)

tenacity (https://pypi.org/project/tenacity/)

Manual Build & Installation:

55 changes: 55 additions & 0 deletions mediawiki.py
@@ -0,0 +1,55 @@
import requests
import time
from datetime import datetime
from email.utils import parsedate_to_datetime
from tenacity import retry, retry_if_result, stop_after_attempt

def is_none_p(value):
    """Return True if value is None"""
    return value is None

def get_retry_after_delay(response: requests.Response) -> int:
    """
    Parses the Retry-After header from an HTTP response and returns the
    delay in seconds.
    """
    retry_after = response.headers.get("Retry-After")
    if not retry_after:
        return 0

    try:
        # Attempt to parse as an integer (delay in seconds)
        delay_seconds = int(retry_after)
        return delay_seconds
    except ValueError:
        # If not an integer, attempt to parse as an HTTP date
        try:
            retry_date = parsedate_to_datetime(retry_after)
            now = datetime.now(retry_date.tzinfo)
            delay_seconds = (retry_date - now).total_seconds()
            return max(0, int(delay_seconds))
        except (ValueError, TypeError):
            raise

def init_session():
    # https://wikitech.wikimedia.org/wiki/Robot_policy
    global session
    session = requests.Session()
    headers = {
        "User-Agent": f"FcitxZhwikiDictBot/1.0 (https://github.com/felixonmars/fcitx5-pinyin-zhwiki) python-requests/{requests.__version__}",
        "Accept-Encoding": "gzip, deflate, br, zstd",
    }
    session.headers.update(headers)

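# Retried by tenacity whenever the wrapped call returns None (e.g. after a
# rate-limit sleep), up to 10 attempts in total.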
@retry(stop=stop_after_attempt(10), retry=retry_if_result(is_none_p))
def do_request(url: str, params=None) -> requests.Response:
    r = session.get(url, params=params)
    if r.status_code == 200:
        return r
    elif r.status_code == 429:
        delay = get_retry_after_delay(r)
        time.sleep(delay)
        return None
    else:
        r.raise_for_status()
        return None
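
For reference, a minimal usage sketch of the new helper (not part of the diff): tenacity's retry_if_result(is_none_p) re-invokes do_request() whenever it returns None (the 429 path), up to 10 attempts, after which tenacity raises RetryError. The page title is the one used by zhwiki-web-slang.py below; everything else is illustrative.

import urllib.parse
from mediawiki import init_session, do_request

init_session()
url = "https://zh.wikipedia.org/w/rest.php/v1/page/" + urllib.parse.quote("中国大陆网络用语列表")
# do_request() sleeps out a 429 Retry-After delay and returns None, which
# tenacity turns into another attempt; other HTTP errors raise immediately
# via raise_for_status().
r = do_request(url)
print(r.json()["source"][:200])
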
15 changes: 8 additions & 7 deletions zhwiki-web-slang.py
@@ -1,20 +1,21 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json
import urllib.parse
import urllib.request
import collections
import sys

from mediawiki import init_session, do_request

def fetch():
_ZHWIKI_SOURCE_URL = "https://zh.wikipedia.org/w/api.php?action=parse&format=json&prop=wikitext&uselang=zh&formatversion=2&page="
# https://www.mediawiki.org/wiki/API:REST_API/Reference#Get_page_source
_ZHWIKI_SOURCE_URL = "https://zh.wikipedia.org/w/rest.php/v1/page/"
_PAGE = "中国大陆网络用语列表"

page = urllib.request.urlopen(_ZHWIKI_SOURCE_URL + urllib.parse.quote(_PAGE)).read()
wikitext = json.loads(page)["parse"]["wikitext"]
return wikitext
url = _ZHWIKI_SOURCE_URL + urllib.parse.quote(_PAGE)
init_session()
r = do_request(url)
page = r.json()
return page["source"]


def trim_templates(wikitext):
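
For context, the REST "Get page source" endpoint linked above returns the page's metadata and wikitext as a single JSON object, which is why the new fetch() reads "source" instead of the Action API's ["parse"]["wikitext"]. An illustrative, trimmed response shape (only "source" is used by this script; the other field names follow the endpoint's documented schema and are assumptions here, as are the example values):

# Rough shape of r.json() for /w/rest.php/v1/page/{title} (illustrative)
page = {
    "id": 123456,                    # example page id
    "key": "中国大陆网络用语列表",
    "title": "中国大陆网络用语列表",
    "content_model": "wikitext",
    "source": "...full wikitext of the page...",
}
wikitext = page["source"]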