Verified Commit f9b34bbd authored by Jordan Petridis's avatar Jordan Petridis 🌱

h-data: Initial implementation of an OPML parser and importer.

This is not really compliant with the OPML spec and there
does not seem to be an OPML crate sadly. There are edge-cases
that are not handled but will only be addressed if a problem is reported.
parent f06dbd05
......@@ -685,6 +685,7 @@ dependencies = [
"hyper-tls 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)",
"maplit 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"native-tls 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
"num_cpus 1.8.0 (registry+https://github.com/rust-lang/crates.io-index)",
"pretty_assertions 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
......@@ -699,6 +700,7 @@ dependencies = [
"tokio-core 0.1.17 (registry+https://github.com/rust-lang/crates.io-index)",
"url 1.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
"xdg 2.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"xml-rs 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
......@@ -2141,6 +2143,14 @@ name = "xdg"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "xml-rs"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"bitflags 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[metadata]
"checksum adler32 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6cbd0b9af8587c72beadc9f72d35b9fbb070982c9e6203e46e93f10df25f8f45"
"checksum aho-corasick 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "d6531d44de723825aa81398a6415283229725a00fa30713812ab9323faa82fc4"
......@@ -2377,3 +2387,4 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
"checksum ws2_32-sys 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d59cefebd0c892fa2dd6de581e937301d8552cb44489cdff035c6187cb63fa5e"
"checksum xdg 2.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a66b7c2281ebde13cf4391d70d4c7e5946c3c25e72a7b859ca8f677dcd0b0c61"
"checksum xml-rs 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3c1cb601d29fe2c2ac60a2b2e5e293994d87a1f6fa9687a31a15270f909be9c2"
......@@ -16,6 +16,7 @@ rfc822_sanitizer = "0.3.3"
rss = "1.5.0"
url = "1.7.0"
xdg = "2.1.0"
xml-rs = "0.7.0"
futures = "0.1.21"
hyper = "0.11.25"
tokio-core = "0.1.17"
......@@ -43,6 +44,7 @@ rand = "0.4.2"
tempdir = "0.3.7"
criterion = "0.2.3"
pretty_assertions = "0.5.1"
maplit = "1.0.1"
[[bench]]
name = "bench"
......
......@@ -5,6 +5,7 @@ use hyper;
use native_tls;
use rss;
use url;
use xml;
use std::io;
......@@ -49,6 +50,8 @@ pub enum DataError {
IOError(#[cause] io::Error),
#[fail(display = "RSS Error: {}", _0)]
RssError(#[cause] rss::Error),
#[fail(display = "XML Reader Error: {}", _0)]
XmlReaderError(#[cause] xml::reader::Error),
#[fail(display = "Error: {}", _0)]
Bail(String),
#[fail(display = "{}", _0)]
......@@ -115,6 +118,12 @@ impl From<rss::Error> for DataError {
}
}
impl From<xml::reader::Error> for DataError {
fn from(err: xml::reader::Error) -> Self {
DataError::XmlReaderError(err)
}
}
impl From<String> for DataError {
fn from(err: String) -> Self {
DataError::Bail(err)
......
......@@ -29,6 +29,10 @@
#[macro_use]
extern crate pretty_assertions;
#[cfg(test)]
#[macro_use]
extern crate maplit;
#[macro_use]
extern crate derive_builder;
#[macro_use]
......@@ -58,6 +62,8 @@ extern crate rss;
extern crate tokio_core;
extern crate url;
extern crate xdg;
#[allow(unused)]
extern crate xml;
pub mod database;
#[allow(missing_docs)]
......@@ -66,6 +72,7 @@ pub mod dbqueries;
pub mod errors;
mod feed;
pub(crate) mod models;
pub mod opml;
mod parser;
pub mod pipeline;
mod schema;
......
//! Parsing of `OPML` documents and importing the feeds they list as `Source`s.
//!
//! FIXME: Expand these docs.
// #![allow(unused)]
use errors::DataError;
use models::Source;
use xml::reader;
use std::collections::HashSet;
use std::io::Read;
// use std::fs::{File, OpenOptions};
// use std::io::BufReader;
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
// FIXME: Make it a Diesel model
/// Represents an `outline` xml element as per the `OPML` [specification][spec].
/// Attributes that are not `RSS` related are omitted.
///
/// All fields take part in equality and hashing, so two outlines are only
/// considered the same entry when title, description and url all match.
///
/// [spec]: http://dev.opml.org/spec2.html
pub struct Opml {
/// Value of the outline's `title` attribute.
title: String,
/// Value of the outline's `description` attribute.
description: String,
/// Value of the outline's `xmlUrl` attribute, i.e. the address of the feed.
url: String,
}
/// Reads an `OPML` document from `reader` and hands the url of every extracted
/// outline to `Source::from_url` (which, per the original note, is what puts
/// the feed into the `Source` table).
///
/// The outer `Result` fails when the document itself cannot be parsed. The
/// inner `Result`s report, per outline, whether `Source::from_url` succeeded.
pub fn opml_import<R: Read>(reader: R) -> Result<Vec<Result<Source, DataError>>, DataError> {
    let outlines = extract_sources(reader)?;

    let mut imported = Vec::with_capacity(outlines.len());
    for outline in &outlines {
        imported.push(Source::from_url(&outline.url));
    }

    Ok(imported)
}
/// Extracts the feed `outline` elements from a reader `R` and returns a `HashSet` of `Opml`
/// structs.
///
/// Only the `title`, `description` and `xmlUrl` attributes of an `outline` are read; an
/// attribute that is absent is left as an empty string. Outlines whose `xmlUrl` is missing or
/// blank (for example outlines that only group other outlines) do not describe a feed and are
/// skipped. Identical outlines collapse into one entry because the result is a set.
///
/// # Errors
///
/// Returns the first `reader::Error` reported by the XML parser. Outlines collected before the
/// error are discarded.
pub fn extract_sources<R: Read>(reader: R) -> Result<HashSet<Opml>, reader::Error> {
    let mut list = HashSet::new();

    for event in reader::EventReader::new(reader) {
        // Only start tags carry attributes; every other event is irrelevant here.
        let (name, attributes) = match event? {
            reader::XmlEvent::StartElement {
                name, attributes, ..
            } => (name, attributes),
            _ => continue,
        };

        if name.local_name != "outline" {
            continue;
        }

        let mut title = String::new();
        let mut url = String::new();
        let mut description = String::new();

        for attribute in attributes {
            match attribute.name.local_name.as_str() {
                "title" => title = attribute.value,
                "xmlUrl" => url = attribute.value,
                "description" => description = attribute.value,
                _ => {}
            }
        }

        // Without a feed url there is nothing to import, and passing an empty
        // string on to `Source::from_url` would only produce a bogus entry.
        if url.trim().is_empty() {
            continue;
        }

        list.insert(Opml {
            title,
            description,
            url,
        });
    }

    Ok(list)
}
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Local;

    /// Feeds a hand-written `OPML` document with two `rss` outlines through
    /// `extract_sources` and checks that exactly those two outlines come back.
    #[test]
    fn test_extract() {
        let intercepted = Opml {
            title: String::from("Intercepted with Jeremy Scahill"),
            description: String::from(
                "The people behind The Intercept’s fearless reporting and incisive \
                 commentary—Jeremy Scahill, Glenn Greenwald, Betsy Reed and others—discuss the \
                 crucial issues of our time: national security, civil liberties, foreign policy, \
                 and criminal justice. Plus interviews with artists, thinkers, and newsmakers \
                 who challenge our preconceptions about the world we live in.",
            ),
            url: String::from("https://feeds.feedburner.com/InterceptedWithJeremyScahill"),
        };

        let deconstructed = Opml {
            title: String::from("Deconstructed with Mehdi Hasan"),
            description: String::from(
                "Journalist Mehdi Hasan is known around the world for his televised takedowns of \
                 presidents and prime ministers. In this new podcast from The Intercept, Mehdi \
                 unpacks a game-changing news event of the week while challenging the conventional \
                 wisdom. As a Brit, a Muslim and an immigrant based in Donald Trump's Washington \
                 D.C., Mehdi gives a refreshingly provocative perspective on the ups and downs of \
                 American—and global—politics.",
            ),
            url: String::from("https://rss.prod.firstlook.media/deconstructed/podcast.rss"),
        };

        // The document is rendered from the expected values so both sides of
        // the assertion are guaranteed to describe the same outlines.
        #[cfg_attr(rustfmt, rustfmt_skip)]
        let document = format!(
            "<?xml version=\"1.0\" encoding=\"UTF-8\"?> \
             <opml version=\"2.0\"> \
               <head> \
                 <title>Test OPML File</title> \
                 <dateCreated>{}</dateCreated> \
                 <docs>http://www.opml.org/spec2</docs> \
               </head> \
               <body> \
                 <outline type=\"rss\" title=\"{}\" description=\"{}\" xmlUrl=\"{}\"/> \
                 <outline type=\"rss\" title=\"{}\" description=\"{}\" xmlUrl=\"{}\"/> \
               </body> \
             </opml>",
            Local::now().format("%a, %d %b %Y %T %Z"),
            intercepted.title,
            intercepted.description,
            intercepted.url,
            deconstructed.title,
            deconstructed.description,
            deconstructed.url,
        );

        let expected = hashset![intercepted, deconstructed];
        let extracted = extract_sources(document.as_bytes()).unwrap();

        assert_eq!(extracted, expected);
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment