rusqlite/src/vtab/csvtab.rs

408 lines
12 KiB
Rust
Raw Normal View History

2021-06-13 15:17:35 +08:00
//! CSV Virtual Table.
2018-07-15 16:19:18 +08:00
//!
//! Port of [csv](http://www.sqlite.org/cgi/src/finfo?name=ext/misc/csv.c) C
2020-11-07 19:32:41 +08:00
//! extension: `https://www.sqlite.org/csv.html`
//!
//! # Example
//!
//! ```rust,no_run
//! # use rusqlite::{Connection, Result};
//! fn example() -> Result<()> {
//! // Note: This should be done once (usually when opening the DB).
//! let db = Connection::open_in_memory()?;
//! rusqlite::vtab::csvtab::load_module(&db)?;
//! // Assume my_csv.csv exists
//! let schema = "
//! CREATE VIRTUAL TABLE my_csv_data
//! USING csv(filename = 'my_csv.csv')
//! ";
//! db.execute_batch(schema)?;
//! // Now the `my_csv_data` (virtual) table can be queried as normal...
//! Ok(())
//! }
//! ```
use std::fs::File;
use std::marker::PhantomData;
2018-07-15 00:47:52 +08:00
use std::os::raw::c_int;
use std::path::Path;
2016-02-11 03:30:08 +08:00
use std::str;
2018-10-31 03:11:35 +08:00
use crate::ffi;
use crate::types::Null;
use crate::vtab::{
dequote, escape_double_quote, parse_boolean, read_only_module, Context, CreateVTab, IndexInfo,
2020-04-14 22:09:50 +08:00
VTab, VTabConnection, VTabCursor, Values,
2018-05-06 23:21:36 +08:00
};
2018-10-31 03:11:35 +08:00
use crate::{Connection, Error, Result};
2021-06-13 15:17:35 +08:00
/// Register the "csv" module.
2018-05-06 18:41:42 +08:00
/// ```sql
/// CREATE VIRTUAL TABLE vtab USING csv(
/// filename=FILENAME -- Name of file containing CSV content
/// [, schema=SCHEMA] -- Alternative CSV schema. 'CREATE TABLE x(col1 TEXT NOT NULL, col2 INT, ...);'
/// [, header=YES|NO] -- First row of CSV defines the names of columns if "yes". Default "no".
/// [, columns=N] -- Assume the CSV file contains N columns.
/// [, delimiter=C] -- CSV delimiter. Default ','.
/// [, quote=C] -- CSV quote. Default '"'. 0 means no quote.
/// );
/// ```
pub fn load_module(conn: &Connection) -> Result<()> {
let aux: Option<()> = None;
conn.create_module("csv", read_only_module::<CsvTab>(), aux)
2018-07-15 00:47:52 +08:00
}
/// An instance of the CSV virtual table
#[repr(C)]
struct CsvTab {
2018-07-15 00:47:52 +08:00
/// Base class. Must be first
base: ffi::sqlite3_vtab,
/// Name of the CSV file
filename: String,
has_headers: bool,
delimiter: u8,
quote: u8,
/// Offset to start of data
offset_first_row: csv::Position,
}
impl CsvTab {
fn reader(&self) -> Result<csv::Reader<File>, csv::Error> {
2018-07-15 00:47:52 +08:00
csv::ReaderBuilder::new()
.has_headers(self.has_headers)
.delimiter(self.delimiter)
.quote(self.quote)
.from_path(&self.filename)
}
2018-05-06 18:41:42 +08:00
fn parameter(c_slice: &[u8]) -> Result<(&str, &str)> {
2018-10-31 03:11:35 +08:00
let arg = str::from_utf8(c_slice)?.trim();
2018-05-06 18:41:42 +08:00
let mut split = arg.split('=');
if let Some(key) = split.next() {
if let Some(value) = split.next() {
let param = key.trim();
let value = dequote(value);
return Ok((param, value));
}
}
Err(Error::ModuleError(format!("illegal argument: '{}'", arg)))
}
fn parse_byte(arg: &str) -> Option<u8> {
if arg.len() == 1 {
arg.bytes().next()
} else {
None
}
}
}
unsafe impl<'vtab> VTab<'vtab> for CsvTab {
2018-05-07 00:05:02 +08:00
type Aux = ();
type Cursor = CsvTabCursor<'vtab>;
2018-06-22 23:20:47 +08:00
fn connect(
2018-07-10 00:53:52 +08:00
_: &mut VTabConnection,
2018-06-22 23:20:47 +08:00
_aux: Option<&()>,
args: &[&[u8]],
) -> Result<(String, CsvTab)> {
2016-02-11 03:30:08 +08:00
if args.len() < 4 {
2016-04-02 23:16:17 +08:00
return Err(Error::ModuleError("no CSV file specified".to_owned()));
2016-02-11 01:07:58 +08:00
}
2018-05-06 18:41:42 +08:00
let mut vtab = CsvTab {
2018-06-29 03:07:05 +08:00
base: ffi::sqlite3_vtab::default(),
2018-05-06 18:41:42 +08:00
filename: "".to_owned(),
has_headers: false,
delimiter: b',',
quote: b'"',
2018-05-13 18:21:58 +08:00
offset_first_row: csv::Position::new(),
};
2018-05-06 18:41:42 +08:00
let mut schema = None;
let mut n_col = None;
2016-02-11 03:30:08 +08:00
2018-05-06 18:41:42 +08:00
let args = &args[3..];
2016-08-13 17:54:19 +08:00
for c_slice in args {
let (param, value) = CsvTab::parameter(c_slice)?;
2018-05-06 18:41:42 +08:00
match param {
"filename" => {
if !Path::new(value).exists() {
2018-05-06 23:21:36 +08:00
return Err(Error::ModuleError(format!(
"file '{}' does not exist",
value
)));
2018-05-06 18:41:42 +08:00
}
vtab.filename = value.to_owned();
2018-05-06 23:21:36 +08:00
}
2018-05-06 18:41:42 +08:00
"schema" => {
schema = Some(value.to_owned());
2018-05-06 23:21:36 +08:00
}
2018-05-06 18:41:42 +08:00
"columns" => {
if let Ok(n) = value.parse::<u16>() {
if n_col.is_some() {
2018-05-06 23:21:36 +08:00
return Err(Error::ModuleError(
"more than one 'columns' parameter".to_owned(),
));
2018-05-06 18:41:42 +08:00
} else if n == 0 {
2018-05-06 23:21:36 +08:00
return Err(Error::ModuleError(
"must have at least one column".to_owned(),
));
2018-05-06 18:41:42 +08:00
}
n_col = Some(n);
} else {
2018-05-06 23:21:36 +08:00
return Err(Error::ModuleError(format!(
"unrecognized argument to 'columns': {}",
value
)));
2018-05-06 18:41:42 +08:00
}
2018-05-06 23:21:36 +08:00
}
2018-05-06 18:41:42 +08:00
"header" => {
if let Some(b) = parse_boolean(value) {
vtab.has_headers = b;
} else {
2018-05-06 23:21:36 +08:00
return Err(Error::ModuleError(format!(
"unrecognized argument to 'header': {}",
value
)));
2018-05-06 18:41:42 +08:00
}
2018-05-06 23:21:36 +08:00
}
2018-05-06 18:41:42 +08:00
"delimiter" => {
if let Some(b) = CsvTab::parse_byte(value) {
2018-05-06 18:41:42 +08:00
vtab.delimiter = b;
} else {
2018-05-06 23:21:36 +08:00
return Err(Error::ModuleError(format!(
"unrecognized argument to 'delimiter': {}",
value
)));
2018-05-06 18:41:42 +08:00
}
2018-05-06 23:21:36 +08:00
}
2018-05-06 18:41:42 +08:00
"quote" => {
if let Some(b) = CsvTab::parse_byte(value) {
2018-05-06 18:41:42 +08:00
if b == b'0' {
vtab.quote = 0;
} else {
vtab.quote = b;
}
} else {
2018-05-06 23:21:36 +08:00
return Err(Error::ModuleError(format!(
"unrecognized argument to 'quote': {}",
value
)));
2018-05-06 18:41:42 +08:00
}
2018-05-06 23:21:36 +08:00
}
2018-05-06 18:41:42 +08:00
_ => {
2018-05-06 23:21:36 +08:00
return Err(Error::ModuleError(format!(
"unrecognized parameter '{}'",
param
)));
}
2016-02-11 03:30:08 +08:00
}
}
2018-05-06 18:41:42 +08:00
if vtab.filename.is_empty() {
return Err(Error::ModuleError("no CSV file specified".to_owned()));
}
let mut cols: Vec<String> = Vec::new();
if vtab.has_headers || (n_col.is_none() && schema.is_none()) {
2018-10-31 03:11:35 +08:00
let mut reader = vtab.reader()?;
2018-05-06 18:41:42 +08:00
if vtab.has_headers {
2018-05-13 18:21:58 +08:00
{
2018-10-31 03:11:35 +08:00
let headers = reader.headers()?;
2018-05-13 18:21:58 +08:00
// headers ignored if cols is not empty
if n_col.is_none() && schema.is_none() {
cols = headers
.into_iter()
2021-06-17 01:22:31 +08:00
.map(|header| escape_double_quote(header).into_owned())
2018-05-13 18:21:58 +08:00
.collect();
}
2018-05-06 18:41:42 +08:00
}
2018-05-13 18:21:58 +08:00
vtab.offset_first_row = reader.position().clone();
2018-05-06 18:41:42 +08:00
} else {
2018-05-13 18:21:58 +08:00
let mut record = csv::ByteRecord::new();
2018-10-31 03:11:35 +08:00
if reader.read_byte_record(&mut record)? {
2018-05-14 01:16:12 +08:00
for (i, _) in record.iter().enumerate() {
cols.push(format!("c{}", i));
2018-05-13 18:21:58 +08:00
}
2018-05-06 18:41:42 +08:00
}
2016-02-11 03:30:08 +08:00
}
2018-05-15 01:23:17 +08:00
} else if let Some(n_col) = n_col {
for i in 0..n_col {
cols.push(format!("c{}", i));
}
2016-02-11 03:30:08 +08:00
}
2018-05-06 18:41:42 +08:00
if cols.is_empty() && schema.is_none() {
return Err(Error::ModuleError("no column specified".to_owned()));
2016-02-11 03:30:08 +08:00
}
2018-05-06 18:41:42 +08:00
if schema.is_none() {
let mut sql = String::from("CREATE TABLE x(");
for (i, col) in cols.iter().enumerate() {
sql.push('"');
sql.push_str(col);
sql.push_str("\" TEXT");
if i == cols.len() - 1 {
sql.push_str(");");
} else {
sql.push_str(", ");
}
2016-02-11 03:30:08 +08:00
}
2018-05-06 18:41:42 +08:00
schema = Some(sql);
2016-02-11 03:30:08 +08:00
}
2019-10-13 19:08:33 +08:00
Ok((schema.unwrap(), vtab))
}
2018-05-06 18:41:42 +08:00
// Only a forward full table scan is supported.
fn best_index(&self, info: &mut IndexInfo) -> Result<()> {
2018-05-14 01:16:12 +08:00
info.set_estimated_cost(1_000_000.);
2016-08-14 15:44:37 +08:00
Ok(())
}
fn open(&self) -> Result<CsvTabCursor<'_>> {
Ok(CsvTabCursor::new(self.reader()?))
}
}
// Empty impl: `CsvTab` overrides none of `CreateVTab`'s items.
impl CreateVTab<'_> for CsvTab {}
2016-08-13 17:54:19 +08:00
/// A cursor for the CSV virtual table
#[repr(C)]
struct CsvTabCursor<'vtab> {
2016-08-13 17:54:19 +08:00
/// Base class. Must be first
base: ffi::sqlite3_vtab_cursor,
2016-08-13 17:54:19 +08:00
/// The CSV reader object
reader: csv::Reader<File>,
2018-05-06 18:41:42 +08:00
/// Current cursor position used as rowid
row_number: usize,
2018-05-06 18:41:42 +08:00
/// Values of the current row
2018-05-13 18:21:58 +08:00
cols: csv::StringRecord,
eof: bool,
phantom: PhantomData<&'vtab CsvTab>,
}
impl CsvTabCursor<'_> {
fn new<'vtab>(reader: csv::Reader<File>) -> CsvTabCursor<'vtab> {
CsvTabCursor {
2018-06-29 03:07:05 +08:00
base: ffi::sqlite3_vtab_cursor::default(),
2018-05-06 14:45:56 +08:00
reader,
row_number: 0,
2018-05-13 18:21:58 +08:00
cols: csv::StringRecord::new(),
eof: false,
phantom: PhantomData,
}
}
/// Accessor to the associated virtual table.
fn vtab(&self) -> &CsvTab {
unsafe { &*(self.base.pVtab as *const CsvTab) }
}
}
unsafe impl VTabCursor for CsvTabCursor<'_> {
2018-05-06 18:41:42 +08:00
// Only a full table scan is supported. So `filter` simply rewinds to
// the beginning.
2018-12-08 04:57:04 +08:00
fn filter(
&mut self,
_idx_num: c_int,
_idx_str: Option<&str>,
_args: &Values<'_>,
) -> Result<()> {
{
2018-05-13 18:21:58 +08:00
let offset_first_row = self.vtab().offset_first_row.clone();
2018-10-31 03:11:35 +08:00
self.reader.seek(offset_first_row)?;
}
self.row_number = 0;
self.next()
}
2018-08-17 00:29:46 +08:00
fn next(&mut self) -> Result<()> {
2016-02-11 03:48:30 +08:00
{
2018-05-13 18:21:58 +08:00
self.eof = self.reader.is_done();
if self.eof {
return Ok(());
2016-02-11 03:48:30 +08:00
}
2018-10-31 03:11:35 +08:00
self.eof = !self.reader.read_record(&mut self.cols)?;
}
2016-02-11 03:48:30 +08:00
2016-08-13 19:55:30 +08:00
self.row_number += 1;
Ok(())
}
2018-08-17 00:29:46 +08:00
fn eof(&self) -> bool {
self.eof
}
2018-08-17 00:29:46 +08:00
fn column(&self, ctx: &mut Context, col: c_int) -> Result<()> {
if col < 0 || col as usize >= self.cols.len() {
2018-05-06 23:21:36 +08:00
return Err(Error::ModuleError(format!(
"column index out of bounds: {}",
col
)));
2016-02-11 01:07:58 +08:00
}
if self.cols.is_empty() {
return ctx.set_result(&Null);
2016-02-11 01:07:58 +08:00
}
// TODO Affinity
ctx.set_result(&self.cols[col as usize].to_owned())
}
2018-08-17 00:29:46 +08:00
fn rowid(&self) -> Result<i64> {
Ok(self.row_number as i64)
}
}
impl From<csv::Error> for Error {
#[cold]
fn from(err: csv::Error) -> Error {
Error::ModuleError(err.to_string())
}
}
#[cfg(test)]
mod test {
    use crate::vtab::csvtab;
    use crate::{Connection, Result};
    use fallible_iterator::FallibleIterator;

    /// Declares a CSV table over `test.csv` with a header row, then checks
    /// the reported column names and the sum of all rowids.
    #[test]
    fn test_csv_module() -> Result<()> {
        let db = Connection::open_in_memory()?;
        csvtab::load_module(&db)?;
        db.execute_batch("CREATE VIRTUAL TABLE vtab USING csv(filename='test.csv', header=yes)")?;

        // The statement lives in its own scope so it is dropped before the
        // table itself is dropped.
        {
            let mut stmt = db.prepare("SELECT rowid, * FROM vtab")?;
            let expected = vec!["rowid", "colA", "colB", "colC"];
            assert_eq!(expected, stmt.column_names());

            let rowids: Result<Vec<i32>> = stmt
                .query([])?
                .map(|row| row.get::<_, i32>(0))
                .collect();
            let total: i32 = rowids?.iter().sum();
            assert_eq!(total, 15);
        }

        db.execute_batch("DROP TABLE vtab")
    }

    /// Self-joins the CSV table so that two cursors are open on it at once,
    /// and checks the rowid of the first joined row.
    #[test]
    fn test_csv_cursor() -> Result<()> {
        let db = Connection::open_in_memory()?;
        csvtab::load_module(&db)?;
        db.execute_batch("CREATE VIRTUAL TABLE vtab USING csv(filename='test.csv', header=yes)")?;

        // The statement lives in its own scope so it is dropped before the
        // table itself is dropped.
        {
            let mut stmt = db.prepare(
                "SELECT v1.rowid, v1.* FROM vtab v1 NATURAL JOIN vtab v2 WHERE \
                 v1.rowid < v2.rowid",
            )?;
            let mut rows = stmt.query([])?;
            let first = rows.next()?.unwrap();
            assert_eq!(first.get_unwrap::<_, i32>(0), 2);
        }

        db.execute_batch("DROP TABLE vtab")
    }
}