Browse Source

better per-record error handling

master
boB Rudis 8 years ago
parent
commit
a74e12dc95
No known key found for this signature in database GPG Key ID: 1D7529BE14E2BBA9
  1. 14
      DESCRIPTION
  2. 4
      NEWS.md
  3. 204
      src/ndjson.cpp

14
DESCRIPTION

@ -1,18 +1,18 @@
Package: ndjson
Type: Package
Title: Wicked-Fast Streaming 'JSON' ('ndjson') Reader
Version: 0.3.0.0
Date: 2016-09-14
Author: Bob Rudis (@hrbrmstr), Niels Lohmann (C++ json parser),
Version: 0.4.0
Date: 2016-11-18
Author: Bob Rudis (bob@rud.is), Niels Lohmann (C++ json parser),
Deepak Bandyopadhyay (C++ gzstream), Lutz Kettner (C++ gzstream)
Maintainer: Bob Rudis <bob@rud.is>
Description: Streaming 'JSON' ('ndjson') has one 'JSON' record per-line and many modern
'ndjson' files contain large numbers of records. These constructs may not be
columnar in nature, but it's often useful to read in these files and "flatten"
the structure out to work in an R data.frame-like context. Functions are provided that
make it possible to read in plain 'ndjson' files or compressed ('gz') 'ndjson'
files and either validate the format of the records or create "flat" data.table
('tbl_dt') structures from them.
the structure out to enable working with the data in an R data.frame-like context.
Functions are provided that make it possible to read in plain 'ndjson' files or
compressed ('gz') 'ndjson' files and either validate the format of the records or
create "flat" data.table ('tbl_dt') structures from them.
URL: http://gitlab.com/hrbrmstr/ndjson
BugReports: https://gitlab.com/hrbrmstr/ndjson/issues
SystemRequirements: zlib, C++11

4
NEWS.md

@ -1,3 +1,7 @@
0.4.0
=====================
* Gracefully handles parsing errors when streaming in data
0.3.0
=====================
* PR from Dirk to remove unnecessary dependency on Rcpp11

204
src/ndjson.cpp

@ -26,53 +26,59 @@ List gz_stream_in(const std::string &path) {
in.close();
List container(num_lines);
R_xlen_t j=0;
R_xlen_t k=0;
igzstream in2;
in2.open(path.c_str());
while(std::getline(in2, line)) {
json o = json::parse(line).flatten();
try {
List lst(o.size());
CharacterVector lst_nms(o.size());
json o = json::parse(line).flatten();
double d_val;
std::string s_val;
bool b_val;
List lst(o.size());
CharacterVector lst_nms(o.size());
R_xlen_t i=0;
for (json::iterator it = o.begin(); it != o.end(); ++it) {
double d_val;
std::string s_val;
bool b_val;
std::string key = it.key();
std::replace(key.begin(), key.end(), '/', '.');
key.erase(0, 1);
R_xlen_t i=0;
for (json::iterator it = o.begin(); it != o.end(); ++it) {
lst_nms[i] = key;
std::string key = it.key();
std::replace(key.begin(), key.end(), '/', '.');
key.erase(0, 1);
if (it.value().is_number()) {
d_val = it.value();
lst[i] = d_val;
} else if (it.value().is_boolean()) {
b_val = it.value();
lst[i] = b_val;
} else if (it.value().is_string()) {
s_val = it.value();
lst[i] = s_val;
} else if (it.value().is_null()) {
lst[i] = NA_LOGICAL;
}
lst_nms[i] = key;
i += 1;
if (it.value().is_number()) {
d_val = it.value();
lst[i] = d_val;
} else if (it.value().is_boolean()) {
b_val = it.value();
lst[i] = b_val;
} else if (it.value().is_string()) {
s_val = it.value();
lst[i] = s_val;
} else if (it.value().is_null()) {
lst[i] = NA_LOGICAL;
}
}
i += 1;
}
lst.attr("names") = lst_nms;
lst.attr("class") = "data.frame";
lst.attr("row.names") = 1;
lst.attr("names") = lst_nms;
lst.attr("class") = "data.frame";
lst.attr("row.names") = 1;
container[j++] = lst;
container[k++] = lst;
} catch(...) {
Rcpp::warning("Parsing error on line " + std::to_string(k));
}
}
@ -87,52 +93,60 @@ List internal_flatten(CharacterVector lines) {
R_xlen_t num_lines = lines.size();
List container(num_lines);
R_xlen_t j=0;
R_xlen_t j=0, k=0;
while(j < num_lines) {
std::string line = as<std::string>(lines[j]);
json o = json::parse(line).flatten();
try {
json o = json::parse(line).flatten();
List lst(o.size());
CharacterVector lst_nms(o.size());
List lst(o.size());
CharacterVector lst_nms(o.size());
double d_val;
std::string s_val;
bool b_val;
double d_val;
std::string s_val;
bool b_val;
R_xlen_t i=0;
for (json::iterator it = o.begin(); it != o.end(); ++it) {
R_xlen_t i=0;
for (json::iterator it = o.begin(); it != o.end(); ++it) {
std::string key = it.key();
std::replace(key.begin(), key.end(), '/', '.');
key.erase(0, 1);
std::string key = it.key();
std::replace(key.begin(), key.end(), '/', '.');
key.erase(0, 1);
lst_nms[i] = key;
lst_nms[i] = key;
if (it.value().is_number()) {
d_val = it.value();
lst[i] = d_val;
} else if (it.value().is_boolean()) {
b_val = it.value();
lst[i] = b_val;
} else if (it.value().is_string()) {
s_val = it.value();
lst[i] = s_val;
} else if (it.value().is_null()) {
lst[i] = NA_LOGICAL;
}
i += 1;
if (it.value().is_number()) {
d_val = it.value();
lst[i] = d_val;
} else if (it.value().is_boolean()) {
b_val = it.value();
lst[i] = b_val;
} else if (it.value().is_string()) {
s_val = it.value();
lst[i] = s_val;
} else if (it.value().is_null()) {
lst[i] = NA_LOGICAL;
}
i += 1;
lst.attr("names") = lst_nms;
lst.attr("class") = "data.frame";
lst.attr("row.names") = 1;
}
container[k++] = lst;
lst.attr("names") = lst_nms;
lst.attr("class") = "data.frame";
lst.attr("row.names") = 1;
} catch(...) {
Rcpp::warning("Parsing error on line " + std::to_string(j));
}
container[j++] = lst;
j++;
}
@ -149,52 +163,58 @@ List j_stream_in(const std::string &path) {
in.close();
List container(num_lines);
R_xlen_t j=0;
R_xlen_t k=0;
in.open(path);
while(getline(in, line)) {
json o = json::parse(line).flatten();
try {
List lst(o.size());
CharacterVector lst_nms(o.size());
json o = json::parse(line).flatten();
double d_val;
std::string s_val;
bool b_val;
List lst(o.size());
CharacterVector lst_nms(o.size());
R_xlen_t i=0;
for (json::iterator it = o.begin(); it != o.end(); ++it) {
double d_val;
std::string s_val;
bool b_val;
std::string key = it.key();
std::replace(key.begin(), key.end(), '/', '.');
key.erase(0, 1);
R_xlen_t i=0;
for (json::iterator it = o.begin(); it != o.end(); ++it) {
lst_nms[i] = key;
std::string key = it.key();
std::replace(key.begin(), key.end(), '/', '.');
key.erase(0, 1);
if (it.value().is_number()) {
d_val = it.value();
lst[i] = d_val;
} else if (it.value().is_boolean()) {
b_val = it.value();
lst[i] = b_val;
} else if (it.value().is_string()) {
s_val = it.value();
lst[i] = s_val;
} else if (it.value().is_null()) {
lst[i] = NA_LOGICAL;
}
lst_nms[i] = key;
i += 1;
if (it.value().is_number()) {
d_val = it.value();
lst[i] = d_val;
} else if (it.value().is_boolean()) {
b_val = it.value();
lst[i] = b_val;
} else if (it.value().is_string()) {
s_val = it.value();
lst[i] = s_val;
} else if (it.value().is_null()) {
lst[i] = NA_LOGICAL;
}
}
i += 1;
}
lst.attr("names") = lst_nms;
lst.attr("class") = "data.frame";
lst.attr("row.names") = 1;
lst.attr("names") = lst_nms;
lst.attr("class") = "data.frame";
lst.attr("row.names") = 1;
container[j++] = lst;
container[k++] = lst;
} catch(...) {
Rcpp::warning("Parsing error on line " + std::to_string(k));
}
}

Loading…
Cancel
Save