diff --git a/main.py b/main.py index 16216d0..16ea62c 100644 --- a/main.py +++ b/main.py @@ -108,6 +108,11 @@ def api_collection_book_chapters(collection_name, bookNumber): book_id = Book.get_id_from_number(bookNumber) return Chapter.query.filter_by(collection=collection_name, arabicBookID=book_id).order_by(Chapter.babID) +@app.route('/v1/collections//books//chapters/', methods=['GET']) +def api_collection_book_chapter(collection_name, bookNumber, chapterId): + book_id = Book.get_id_from_number(bookNumber) + chapter = Chapter.query.filter_by(collection=collection_name, arabicBookID=book_id, babID=chapterId).first_or_404() + return jsonify(chapter.serialize()) if __name__ == '__main__': app.run(host='0.0.0.0') diff --git a/models.py b/models.py index b3c746c..626c608 100644 --- a/models.py +++ b/models.py @@ -1,6 +1,6 @@ from flask_sqlalchemy import SQLAlchemy from main import app -from text_transform import cleanup_text, cleanup_en_text +from text_transform import cleanup_text, cleanup_en_text, cleanup_chapter_title db = SQLAlchemy(app) db.reflect() @@ -99,9 +99,9 @@ def serialize(self): { 'lang': 'ar', 'chapterNumber': str(self.arabicBabNumber), - 'chapterTitle': self.arabicBabName, - 'intro': self.arabicIntro, - 'ending': self.arabicEnding + 'chapterTitle': cleanup_chapter_title(self.arabicBabName), + 'intro': cleanup_text(self.arabicIntro), + 'ending': cleanup_text(self.arabicEnding) } ] } @@ -128,7 +128,7 @@ def serialize(self): { 'lang': 'ar', 'chapterNumber': self.arabicBabNumber, - 'chapterTitle': self.arabicBabName, + 'chapterTitle': cleanup_chapter_title(self.arabicBabName), 'urn': self.arabicURN, 'body': cleanup_text(self.arabicText), 'grade': self.arabicgrade1, diff --git a/text_transform.py b/text_transform.py index e618384..d5cf9f6 100644 --- a/text_transform.py +++ b/text_transform.py @@ -2,7 +2,7 @@ import lxml.html import lxml -def fix_html(text): +def fix_html(text, remove_wrapper=False): """Fix invalid html, remove unnecessary attribs, tags and whitespace""" text = text.strip() text = text.replace('\r', '') # remove \r as lxml escapes it @@ -19,6 +19,9 @@ def fix_html(text): continue children.append(lxml.etree.tostring(elem, encoding='unicode')) text = '\n'.join(children) + if remove_wrapper: + text = re.sub(r'^

', '', text) + text = re.sub(r'

$', '', text) text = re.sub(r'', '', text) # remove like tags return text @@ -60,6 +63,8 @@ def fix_hyperlinks(text): return text def cleanup_text(text): + if not text: + return text text = re.sub(r'\n+', '\n', text) text = re.sub(r' +', ' ', text) text = fix_html(text) @@ -73,3 +78,13 @@ def cleanup_en_text(text): text = cleanup_text(text) text = standardize_terms(text) return text + +def cleanup_chapter_title(text): + if not text: + return text + text = re.sub(r'\n+', '\n', text) + text = re.sub(r' +', ' ', text) + text = fix_html(text, remove_wrapper=True) + text = fix_hyperlinks(text) + text = text.strip() + return text