Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 0 additions & 10 deletions src-rust/rsc/html_filter_rules/itsfoss.com.toml

This file was deleted.

32 changes: 26 additions & 6 deletions src-rust/toolkit-core/src/models/html_filter_rule.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,34 @@
#[derive(PartialEq, Eq, Debug, serde::Deserialize)]
pub struct HTMLFilterRule {
pub tags: Vec<String>,
pub classes: Vec<String>,
pub struct HTMLFilterRule<'a> {
pub tags: Vec<&'a str>,
pub classes: Vec<&'a str>,
}

impl HTMLFilterRule {
pub fn new(tags: Vec<String>, classes: Vec<String>) -> Self {
impl<'a> HTMLFilterRule<'a> {
fn new(tags: Vec<&'a str>, classes: Vec<&'a str>) -> Self {
Self {
tags,
classes,
}
}

/// Returns the built-in HTML filter rule for a website.
///
/// `url` is matched verbatim against the known host names, so it must be
/// the bare host (e.g. `"itsfoss.com"`); a scheme, path, or any other
/// sub-domain will fall through to the default rule.
///
/// # Returns
///
/// * For `itsfoss.com` and `news.itsfoss.com`: a rule listing the tag and
///   class names to filter for those sites.
/// * For every other value: an empty rule (no tags and no classes).
pub fn get_filter_rule(url: &str) -> Self {
    match url {
        "itsfoss.com" | "news.itsfoss.com" => Self::new(
            // NOTE(review): the last three entries look like Freestar ad
            // container identifiers rather than element names; confirm that
            // they belong in `tags`.
            vec![
                "script", "style", "link", "meta", "li", "desc", "title", "svg", "path",
                "dialog", "select", "head", "header", "foot", "footer", "ul", "nav", "button",
                "form", "input", "picture", "time", "h2", "h3", "h4", "i", "aside",
                "FreeStarVideoAdContainer", "freestar-video-parent", "freestar-video-child",
            ],
            vec![
                "progress-bar", "js-menu", "social-share", "post-info__readtime",
                "cta__description", "cta__inner", "cta__content", "hide-mobile", "js-toc",
                "author-card", "related-posts",
            ],
        ),
        // Unknown host: fall back to a rule with nothing listed.
        _ => Self::new(Vec::new(), Vec::new()),
    }
}
}
25 changes: 3 additions & 22 deletions src-rust/toolkit-core/src/workflow/translate/select.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,32 +23,13 @@ pub fn get_content(url: &str) -> Result<String, String> {
host.unwrap()
};

let html_filter_rule_path = format!("rsc/html_filter_rules/{}.toml", host);
let html_filter_rule_str = std::fs::read_to_string(html_filter_rule_path);
let html_filter_rule = match html_filter_rule_str {
Ok(html_filter_rule_str) => {
let html_filter_rule: Result<HTMLFilterRule, _> = toml::from_str(&html_filter_rule_str);
match html_filter_rule {
Ok(html_filter_rule) => html_filter_rule,
Err(_) => {
let error_msg = format!(
"Failed to parse the HTML filter rule for the website: {}", host
);
return Err(error_msg);
}
}
},
Err(_) => {
// Use the default HTML filter rule (no tags and classes to filter)
HTMLFilterRule::new(Vec::new(), Vec::new())
}
};
let html_filter_rule = HTMLFilterRule::get_filter_rule(host);

// Filter the HTML content
let filtered_html = libhtmlfilter::get_filtered_html_fullurl_removeref(
url,
html_filter_rule.tags.iter().map(|s| s.as_str()).collect::<Vec<&str>>().as_slice(),
html_filter_rule.classes.iter().map(|s| s.as_str()).collect::<Vec<&str>>().as_slice()
&*html_filter_rule.tags,
&*html_filter_rule.classes
);

// Parse HTML to markdown
Expand Down