From 10ba0514e61cd5e65f8e997fc2c42eb403273a4d Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 6 May 2018 12:41:42 +0200 Subject: [PATCH] Change arguments parsing of CSV module --- src/vtab/csvtab.rs | 191 ++++++++++++++++++++++++++++++++------------- src/vtab/mod.rs | 57 ++++++++++++++ 2 files changed, 193 insertions(+), 55 deletions(-) diff --git a/src/vtab/csvtab.rs b/src/vtab/csvtab.rs index d7c6b99..00f823c 100644 --- a/src/vtab/csvtab.rs +++ b/src/vtab/csvtab.rs @@ -1,4 +1,5 @@ //! CSV Virtual Table +//! Port of [csv](http://www.sqlite.org/cgi/src/finfo?name=ext/misc/csv.c) C extension. extern crate csv; use std::fs::File; use std::os::raw::{c_char, c_int, c_void}; @@ -9,9 +10,19 @@ use std::str; use {Connection, Error, Result}; use ffi; use types::Null; -use vtab::{declare_vtab, escape_double_quote, Context, IndexInfo, Values, VTab, VTabCursor}; +use vtab::{declare_vtab, dequote, escape_double_quote, parse_boolean, Context, IndexInfo, Values, VTab, VTabCursor}; /// Register the "csv" module. +/// ```sql +/// CREATE VIRTUAL TABLE vtab USING csv( +/// filename=FILENAME -- Name of file containing CSV content +/// [, schema=SCHEMA] -- Alternative CSV schema. 'CREATE TABLE x(col1 TEXT NOT NULL, col2 INT, ...);' +/// [, header=YES|NO] -- First row of CSV defines the names of columns if "yes". Default "no". +/// [, columns=N] -- Assume the CSV file contains N columns. +/// [, delimiter=C] -- CSV delimiter. Default ','. +/// [, quote=C] -- CSV quote. Default '"'. 0 means no quote. +/// ); +/// ``` pub fn load_module(conn: &Connection) -> Result<()> { let aux: Option<()> = None; conn.create_module("csv", &CSV_MODULE, aux) @@ -55,6 +66,27 @@ impl CSVTab { .quote(self.quote) }) } + + fn parameter(c_slice: &[u8]) -> Result<(&str, &str)> { + let arg = try!(str::from_utf8(c_slice)).trim(); + let mut split = arg.split('='); + if let Some(key) = split.next() { + if let Some(value) = split.next() { + let param = key.trim(); + let value = dequote(value); + return Ok((param, value)); + } + } + Err(Error::ModuleError(format!("illegal argument: '{}'", arg))) + } + + fn parse_byte(arg: &str) -> Option { + if arg.len() == 1 { + arg.bytes().next() + } else { + None + } + } } impl VTab for CSVTab { @@ -62,78 +94,124 @@ impl VTab for CSVTab { if args.len() < 4 { return Err(Error::ModuleError("no CSV file specified".to_owned())); } - // pull out name of csv file (remove quotes) - let mut c_filename = args[3]; - if c_filename[0] == b'\'' { - c_filename = &c_filename[1..c_filename.len() - 1]; - } - let filename = try!(str::from_utf8(c_filename)); - if !Path::new(filename).exists() { - return Err(Error::ModuleError(format!("file '{}' does not exist", filename))); - } + let mut vtab = CSVTab { base: Default::default(), - filename: String::from(filename), + filename: "".to_owned(), has_headers: false, delimiter: b',', quote: b'"', offset_first_row: 0, }; - let mut cols: Vec = Vec::new(); + let mut schema = None; + let mut n_col = None; - let args = &args[4..]; + let args = &args[3..]; for c_slice in args { - if c_slice.len() == 1 { - vtab.delimiter = c_slice[0]; - } else if c_slice.len() == 3 && c_slice[0] == b'\'' { - vtab.delimiter = c_slice[1]; + let (param, value) = try!(CSVTab::parameter(c_slice)); + match param { + "filename" => { + if !Path::new(value).exists() { + return Err(Error::ModuleError(format!("file '{}' does not exist", value))); + } + vtab.filename = value.to_owned(); + }, + "schema" => { + schema = Some(value.to_owned()); + }, + "columns" => { + if let Ok(n) = value.parse::() { + if n_col.is_some() { + return Err(Error::ModuleError("more than one 'columns' parameter".to_owned())); + } else if n == 0 { + return Err(Error::ModuleError("must have at least one column".to_owned())); + } + n_col = Some(n); + } else { + return Err(Error::ModuleError(format!("unrecognized argument to 'columns': {}", value))); + } + }, + "header" => { + if let Some(b) = parse_boolean(value) { + vtab.has_headers = b; + } else { + return Err(Error::ModuleError(format!("unrecognized argument to 'header': {}", value))); + } + }, + "delimiter" => { + if let Some(b) = CSVTab::parse_byte(value) { + vtab.delimiter = b; + } else { + return Err(Error::ModuleError(format!("unrecognized argument to 'delimiter': {}", value))); + } + }, + "quote" => { + if let Some(b) = CSVTab::parse_byte(value) { + if b == b'0' { + vtab.quote = 0; + } else { + vtab.quote = b; + } + } else { + return Err(Error::ModuleError(format!("unrecognized argument to 'quote': {}", value))); + } + }, + _ => { + return Err(Error::ModuleError(format!("unrecognized parameter '{}'", param))); + }, + } + } + + if vtab.filename.is_empty() { + return Err(Error::ModuleError("no CSV file specified".to_owned())); + } + + let mut cols: Vec = Vec::new(); + if vtab.has_headers || (n_col.is_none() && schema.is_none()) { + let mut reader = try!(vtab.reader()); + if vtab.has_headers { + let headers = try!(reader.headers()); + vtab.offset_first_row = reader.byte_offset(); + // headers ignored if cols is not empty + if n_col.is_none() && schema.is_none() { + cols = headers.into_iter().map(|header| escape_double_quote(&header).into_owned()).collect(); + } } else { - let arg = try!(str::from_utf8(c_slice)); - let uc = arg.to_uppercase(); - if uc.contains("HEADER") { - vtab.has_headers = true; - } else if uc.contains("NO_QUOTE") { - vtab.quote = 0; - } else { - cols.push(escape_double_quote(arg).into_owned()); + let mut count = 0; + while let Some(col) = reader.next_bytes().into_iter_result() { + try!(col); + cols.push(format!("c{}", count)); + count+=1; } } } - if vtab.has_headers { - let mut reader = try!(vtab.reader()); - let headers = try!(reader.headers()); - vtab.offset_first_row = reader.byte_offset(); - // headers ignored if cols is not empty - if cols.is_empty() { - cols = headers; - } + if cols.is_empty() && schema.is_none() { + return Err(Error::ModuleError("no column specified".to_owned())); } - if cols.is_empty() { - return Err(Error::ModuleError("no column name specified".to_owned())); + if schema.is_none() { + let mut sql = String::from("CREATE TABLE x("); + for (i, col) in cols.iter().enumerate() { + sql.push('"'); + sql.push_str(col); + sql.push_str("\" TEXT"); + if i == cols.len() - 1 { + sql.push_str(");"); + } else { + sql.push_str(", "); + } + } + schema = Some(sql); } - let mut sql = String::from("CREATE TABLE x("); - for (i, col) in cols.iter().enumerate() { - if col.is_empty() { - return Err(Error::ModuleError("no column name found".to_owned())); - } - sql.push('"'); - sql.push_str(col); - sql.push_str("\" TEXT"); - if i == cols.len() - 1 { - sql.push_str(");"); - } else { - sql.push_str(", "); - } - } - - try!(declare_vtab(db, &sql)); + try!(declare_vtab(db, &schema.unwrap())); Ok(vtab) } - fn best_index(&self, _info: &mut IndexInfo) -> Result<()> { + // Only a forward full table scan is supported. + fn best_index(&self, info: &mut IndexInfo) -> Result<()> { + info.set_estimated_cost(1000000.); Ok(()) } @@ -149,8 +227,9 @@ struct CSVTabCursor { base: ffi::sqlite3_vtab_cursor, /// The CSV reader object reader: csv::Reader, - /// Current cursor position + /// Current cursor position used as rowid row_number: usize, + /// Values of the current row cols: Vec, eof: bool, } @@ -172,6 +251,8 @@ impl VTabCursor for CSVTabCursor { unsafe { & *(self.base.pVtab as *const CSVTab) } } + // Only a full table scan is supported. So `filter` simply rewinds to + // the beginning. fn filter(&mut self, _idx_num: c_int, _idx_str: Option<&str>, @@ -236,7 +317,7 @@ mod test { fn test_csv_module() { let db = Connection::open_in_memory().unwrap(); csvtab::load_module(&db).unwrap(); - db.execute_batch("CREATE VIRTUAL TABLE vtab USING csv('test.csv', HAS_HEADERS)").unwrap(); + db.execute_batch("CREATE VIRTUAL TABLE vtab USING csv(filename='test.csv', header=yes)").unwrap(); { let mut s = db.prepare("SELECT rowid, * FROM vtab").unwrap(); @@ -257,7 +338,7 @@ mod test { fn test_csv_cursor() { let db = Connection::open_in_memory().unwrap(); csvtab::load_module(&db).unwrap(); - db.execute_batch("CREATE VIRTUAL TABLE vtab USING csv('test.csv', HAS_HEADERS)").unwrap(); + db.execute_batch("CREATE VIRTUAL TABLE vtab USING csv(filename='test.csv', header=yes)").unwrap(); { let mut s = diff --git a/src/vtab/mod.rs b/src/vtab/mod.rs index 5bc27a5..c74f0f3 100644 --- a/src/vtab/mod.rs +++ b/src/vtab/mod.rs @@ -331,6 +331,35 @@ pub fn escape_double_quote(identifier: &str) -> Cow { Borrowed(identifier) } } +/// Dequote string +pub fn dequote(s: &str) -> &str { + if s.len() < 2 { + return s; + } + match s.bytes().next() { + Some(b) if b == b'"' || b == b'\'' => { + match s.bytes().rev().next() { + Some(e) if e == b => &s[1..s.len()-1], + _ => s, + } + }, + _ => s, + } +} +/// The boolean can be one of: +/// ```text +/// 1 yes true on +/// 0 no false off +/// ``` +pub fn parse_boolean(s: &str) -> Option { + if s.eq_ignore_ascii_case("yes") || s.eq_ignore_ascii_case("on") || s.eq_ignore_ascii_case("true") || s.eq("1") { + Some(true) + } else if s.eq_ignore_ascii_case("no") || s.eq_ignore_ascii_case("off") || s.eq_ignore_ascii_case("false") || s.eq("0") { + Some(false) + } else { + None + } +} // FIXME copy/paste from function.rs unsafe extern "C" fn free_boxed_value(p: *mut c_void) { @@ -665,3 +694,31 @@ pub mod int_array; pub mod csvtab; #[cfg(feature = "bundled")] pub mod series; + +#[cfg(test)] +mod test { + #[test] + fn test_dequote() { + assert_eq!("", super::dequote("")); + assert_eq!("'", super::dequote("'")); + assert_eq!("\"", super::dequote("\"")); + assert_eq!("'\"", super::dequote("'\"")); + assert_eq!("", super::dequote("''")); + assert_eq!("", super::dequote("\"\"")); + assert_eq!("x", super::dequote("'x'")); + assert_eq!("x", super::dequote("\"x\"")); + assert_eq!("x", super::dequote("x")); + } + #[test] + fn test_parse_boolean() { + assert_eq!(None, super::parse_boolean("")); + assert_eq!(Some(true), super::parse_boolean("1")); + assert_eq!(Some(true), super::parse_boolean("yes")); + assert_eq!(Some(true), super::parse_boolean("on")); + assert_eq!(Some(true), super::parse_boolean("true")); + assert_eq!(Some(false), super::parse_boolean("0")); + assert_eq!(Some(false), super::parse_boolean("no")); + assert_eq!(Some(false), super::parse_boolean("off")); + assert_eq!(Some(false), super::parse_boolean("false")); + } +} \ No newline at end of file