diff --git a/src-rust/rsc/html_filter_rules/itsfoss.com.toml b/src-rust/rsc/html_filter_rules/itsfoss.com.toml deleted file mode 100644 index 5fff284..0000000 --- a/src-rust/rsc/html_filter_rules/itsfoss.com.toml +++ /dev/null @@ -1,10 +0,0 @@ -tags = [ - "script", "style", "link", "meta", "li", "desc", "title", "svg", "path", "dialog", "select", "head", "header", - "foot", "footer", "ul", "nav", "button", "form", "input", "picture", "time", "h2", "h3", "h4", "i", "aside", - "FreeStarVideoAdContainer", "freestar-video-parent", "reestar-video-child" -] - -classes = [ - "progress-bar", "js-menu", "social-share", "post-info__readtime", "cta__description", "cta__inner", "cta__content", - "hide-mobile", "js-toc", "author-card", "related-posts" -] diff --git a/src-rust/toolkit-core/src/models/html_filter_rule.rs b/src-rust/toolkit-core/src/models/html_filter_rule.rs index 00927b0..3514b21 100644 --- a/src-rust/toolkit-core/src/models/html_filter_rule.rs +++ b/src-rust/toolkit-core/src/models/html_filter_rule.rs @@ -1,14 +1,34 @@ -#[derive(PartialEq, Eq, Debug, serde::Deserialize)] -pub struct HTMLFilterRule { - pub tags: Vec, - pub classes: Vec, +pub struct HTMLFilterRule<'a> { + pub tags: Vec<&'a str>, + pub classes: Vec<&'a str>, } -impl HTMLFilterRule { - pub fn new(tags: Vec, classes: Vec) -> Self { +impl<'a> HTMLFilterRule<'a> { + fn new(tags: Vec<&'a str>, classes: Vec<&'a str>) -> Self { Self { tags, classes, } } + + pub fn get_filter_rule(url: &str) -> Self { + match url { + "itsfoss.com" | "news.itsfoss.com" => { + Self::new( + vec![ + "script", "style", "link", "meta", "li", "desc", "title", "svg", "path", + "dialog", "select", "head", "header", "foot", "footer", "ul", "nav", "button", + "form", "input", "picture", "time", "h2", "h3", "h4", "i", "aside", + "FreeStarVideoAdContainer", "freestar-video-parent", "reestar-video-child", + ], + vec![ + "progress-bar", "js-menu", "social-share", "post-info__readtime", + "cta__description", "cta__inner", "cta__content", "hide-mobile", "js-toc", + "author-card", "related-posts", + ], + ) + } + _ => Self::new(vec![], vec![]) + } + } } diff --git a/src-rust/toolkit-core/src/workflow/translate/select.rs b/src-rust/toolkit-core/src/workflow/translate/select.rs index fbd1506..b9eb796 100644 --- a/src-rust/toolkit-core/src/workflow/translate/select.rs +++ b/src-rust/toolkit-core/src/workflow/translate/select.rs @@ -23,32 +23,13 @@ pub fn get_content(url: &str) -> Result { host.unwrap() }; - let html_filter_rule_path = format!("rsc/html_filter_rules/{}.toml", host); - let html_filter_rule_str = std::fs::read_to_string(html_filter_rule_path); - let html_filter_rule = match html_filter_rule_str { - Ok(html_filter_rule_str) => { - let html_filter_rule: Result = toml::from_str(&html_filter_rule_str); - match html_filter_rule { - Ok(html_filter_rule) => html_filter_rule, - Err(_) => { - let error_msg = format!( - "Failed to parse the HTML filter rule for the website: {}", host - ); - return Err(error_msg); - } - } - }, - Err(_) => { - // Use the default HTML filter rule (no tags and classes to filter) - HTMLFilterRule::new(Vec::new(), Vec::new()) - } - }; + let html_filter_rule = HTMLFilterRule::get_filter_rule(host); // Filter the HTML content let filtered_html = libhtmlfilter::get_filtered_html_fullurl_removeref( url, - html_filter_rule.tags.iter().map(|s| s.as_str()).collect::>().as_slice(), - html_filter_rule.classes.iter().map(|s| s.as_str()).collect::>().as_slice() + &*html_filter_rule.tags, + &*html_filter_rule.classes ); // Parse HTML to markdown