Browse Source

- Lowercased headers

- Fixed response parser to handle a status line with no `.` in the HTTP
  version and/or no status text
- Added buffer size parameter to file reader
batman
boB Rudis 4 years ago
parent
commit
b172d7df0b
No known key found for this signature in database GPG Key ID: 1D7529BE14E2BBA9
  1. 63
      R/RcppExports.R
  2. 24
      README.Rmd
  3. 13
      man/parse_request.Rd
  4. 16
      man/parse_response.Rd
  5. 4
      man/parse_url.Rd
  6. 13
      man/read_file_raw.Rd
  7. 45
      src/RcppExports.cpp
  8. 75
      src/code.cpp
  9. 67
      src/httpresponseparser.h

63
R/RcppExports.R

@ -3,49 +3,86 @@
#' Parse an HTTP request
#'
#' You can use the non- `_raw` version on input you know for sure has is plain text
#' You can use the non- `_raw` version on input you know for sure is plain text
#'
#' @param req HTTP request character string
#' @param headers_lowercase if `TRUE` (the default) names in the `headers` data frame
#' element are converted to lower case
#' @export
parse_request <- function(req) {
.Call(`_construe_parse_request`, req)
#' @examples
#' paste0(c(
#' "GET /uri.cgi HTTP/1.1\r\n",
#' "User-Agent: Mozilla/5.0\r\n",
#' "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n",
#' "Host: 127.0.0.1\r\n", "\r\n"
#' ), collapse = "") -> req
#'
#' res <- parse_request(req)
#' res <- parse_request_raw(charToRaw(req))
parse_request <- function(req, headers_lowercase = TRUE) {
# Thin R-level wrapper: forwards `req` and `headers_lowercase` unchanged to
# the native routine `_construe_parse_request` and returns whatever it returns.
.Call(`_construe_parse_request`, req, headers_lowercase)
}
#' @rdname parse_request
#' @export
parse_request_raw <- function(req) {
.Call(`_construe_parse_request_raw`, req)
parse_request_raw <- function(req, headers_lowercase = TRUE) {
# Thin R-level wrapper: forwards `req` and `headers_lowercase` unchanged to
# the native routine `_construe_parse_request_raw` and returns its result.
.Call(`_construe_parse_request_raw`, req, headers_lowercase)
}
#' Parse an HTTP response
#'
#' You can use the non- `_raw` version on input you know for sure has is plain text
#' You can use the non- `_raw` version on input you know for sure is plain text
#'
#' @param resp HTTP response character string
#' @param headers_lowercase if `TRUE` (the default) names in the `headers` data frame
#' element are converted to lower case
#' @export
parse_response <- function(resp) {
.Call(`_construe_parse_response`, resp)
#' @examples
#' paste0(c(
#' "HTTP/1.1 200 OK\r\n",
#' "Server: nginx/1.2.1\r\n",
#' "Content-Type: text/html\r\n",
#' "Content-Length: 8\r\n",
#' "Connection: keep-alive\r\n",
#' "\r\n",
#' "<html />"
#' ), collapse = "") -> resp
#'
#' res <- parse_response(resp)
#' res <- parse_response_raw(charToRaw(resp))
parse_response <- function(resp, headers_lowercase = TRUE) {
# Thin R-level wrapper: forwards `resp` and `headers_lowercase` unchanged to
# the native routine `_construe_parse_response` and returns its result.
.Call(`_construe_parse_response`, resp, headers_lowercase)
}
#' @rdname parse_response
#' @export
parse_response_raw <- function(resp) {
.Call(`_construe_parse_response_raw`, resp)
parse_response_raw <- function(resp, headers_lowercase = TRUE) {
# Thin R-level wrapper: forwards `resp` and `headers_lowercase` unchanged to
# the native routine `_construe_parse_response_raw` and returns its result.
.Call(`_construe_parse_response_raw`, resp, headers_lowercase)
}
#' Parse URLs
#'
#' @param urls character vector of URLs
#' @export
#' @examples
#' URL <- "http://www.example.com/dir/subdir?param=1&param=2;param%20=%20#fragment"
#' parse_url(URL)
parse_url <- function(urls) {
# Thin R-level wrapper: hands `urls` to the native routine
# `_construe_parse_url` and returns its result.
.Call(`_construe_parse_url`, urls)
}
#' Read in a file, fast and raw
#'
#' @param fil file to read in (no path expansion is performed)a
#' @param fil file to read in (no path expansion is performed)
#' @param buffer_size larger buffer sizes may speed up reading of
#' very large files. It can also hurt performance, and this
#' function reads in the entire file into memory, so a
#' large buffer size also means more (temporary) memory will
#' be allocated.
#' @export
read_file_raw <- function(fil) {
.Call(`_construe_read_file_raw`, fil)
#' @examples
#' read_file_raw(system.file("extdat", "example.hdr", package = "construe"))
read_file_raw <- function(fil, buffer_size = 16384L) {
# Thin R-level wrapper: forwards `fil` and `buffer_size` (default 16384L)
# unchanged to the native routine `_construe_read_file_raw` and returns its
# result.
.Call(`_construe_read_file_raw`, fil, buffer_size)
}

24
README.Rmd

@ -87,6 +87,30 @@ microbenchmark::microbenchmark(
)
```
### curl output example
`HEAD` request:
```{r curl-01}
sys::exec_internal(
cmd = "curl",
args = c("--include", "--head", "--silent", "https://httpbin.org/")
) -> res
rawToChar(res$stdout)
```
`GET` request:
```{r curl-02}
sys::exec_internal(
cmd = "curl",
args = c("--include", "--silent", "https://httpbin.org/")
) -> res
parse_response_raw(res$stdout)
```
### URLs
```{r ex03}

13
man/parse_request.Rd

@ -13,5 +13,16 @@ parse_request_raw(req)
\item{req}{HTTP request character string}
}
\description{
You can use the non- \verb{_raw} version on input you know for sure has is plain text
You can use the non- \verb{_raw} version on input you know for sure is plain text
}
\examples{
paste0(c(
"GET /uri.cgi HTTP/1.1\r\n",
"User-Agent: Mozilla/5.0\r\n",
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n",
"Host: 127.0.0.1\r\n", "\r\n"
), collapse = "") -> req
res <- parse_request(req)
res <- parse_request_raw(charToRaw(req))
}

16
man/parse_response.Rd

@ -13,5 +13,19 @@ parse_response_raw(resp)
\item{resp}{HTTP response character string}
}
\description{
You can use the non- \verb{_raw} version on input you know for sure has is plain text
You can use the non- \verb{_raw} version on input you know for sure is plain text
}
\examples{
paste0(c(
"HTTP/1.1 200 OK\r\n",
"Server: nginx/1.2.1\r\n",
"Content-Type: text/html\r\n",
"Content-Length: 8\r\n",
"Connection: keep-alive\r\n",
"\r\n",
"<html />"
), collapse = "") -> resp
res <- parse_response(resp)
res <- parse_response_raw(charToRaw(resp))
}

4
man/parse_url.Rd

@ -12,3 +12,7 @@ parse_url(urls)
\description{
Parse URLs
}
\examples{
URL <- "http://www.example.com/dir/subdir?param=1&param=2;param\%20=\%20#fragment"
parse_url(URL)
}

13
man/read_file_raw.Rd

@ -4,11 +4,20 @@
\alias{read_file_raw}
\title{Read in a file, fast and raw}
\usage{
read_file_raw(fil)
read_file_raw(fil, buffer_size = 16384L)
}
\arguments{
\item{fil}{file to read in (no path expansion is performed)a}
\item{fil}{file to read in (no path expansion is performed)}
\item{buffer_size}{larger buffer sizes may speed up reading of
very large files. It can also hurt performance, and this
function reads in the entire file into memory, so a
large buffer size also means more (temporary) memory will
be allocated.}
}
\description{
Read in a file, fast and raw
}
\examples{
read_file_raw(system.file("extdat", "example.hdr", package = "construe"))
}

45
src/RcppExports.cpp

@ -6,46 +6,50 @@
using namespace Rcpp;
// parse_request
List parse_request(String req);
RcppExport SEXP _construe_parse_request(SEXP reqSEXP) {
List parse_request(String req, bool headers_lowercase);
RcppExport SEXP _construe_parse_request(SEXP reqSEXP, SEXP headers_lowercaseSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< String >::type req(reqSEXP);
rcpp_result_gen = Rcpp::wrap(parse_request(req));
Rcpp::traits::input_parameter< bool >::type headers_lowercase(headers_lowercaseSEXP);
rcpp_result_gen = Rcpp::wrap(parse_request(req, headers_lowercase));
return rcpp_result_gen;
END_RCPP
}
// parse_request_raw
List parse_request_raw(RawVector req);
RcppExport SEXP _construe_parse_request_raw(SEXP reqSEXP) {
List parse_request_raw(RawVector req, bool headers_lowercase);
RcppExport SEXP _construe_parse_request_raw(SEXP reqSEXP, SEXP headers_lowercaseSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< RawVector >::type req(reqSEXP);
rcpp_result_gen = Rcpp::wrap(parse_request_raw(req));
Rcpp::traits::input_parameter< bool >::type headers_lowercase(headers_lowercaseSEXP);
rcpp_result_gen = Rcpp::wrap(parse_request_raw(req, headers_lowercase));
return rcpp_result_gen;
END_RCPP
}
// parse_response
List parse_response(String resp);
RcppExport SEXP _construe_parse_response(SEXP respSEXP) {
List parse_response(String resp, bool headers_lowercase);
RcppExport SEXP _construe_parse_response(SEXP respSEXP, SEXP headers_lowercaseSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< String >::type resp(respSEXP);
rcpp_result_gen = Rcpp::wrap(parse_response(resp));
Rcpp::traits::input_parameter< bool >::type headers_lowercase(headers_lowercaseSEXP);
rcpp_result_gen = Rcpp::wrap(parse_response(resp, headers_lowercase));
return rcpp_result_gen;
END_RCPP
}
// parse_response_raw
List parse_response_raw(RawVector resp);
RcppExport SEXP _construe_parse_response_raw(SEXP respSEXP) {
List parse_response_raw(RawVector resp, bool headers_lowercase);
RcppExport SEXP _construe_parse_response_raw(SEXP respSEXP, SEXP headers_lowercaseSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< RawVector >::type resp(respSEXP);
rcpp_result_gen = Rcpp::wrap(parse_response_raw(resp));
Rcpp::traits::input_parameter< bool >::type headers_lowercase(headers_lowercaseSEXP);
rcpp_result_gen = Rcpp::wrap(parse_response_raw(resp, headers_lowercase));
return rcpp_result_gen;
END_RCPP
}
@ -61,24 +65,25 @@ BEGIN_RCPP
END_RCPP
}
// read_file_raw
RawVector read_file_raw(CharacterVector fil);
RcppExport SEXP _construe_read_file_raw(SEXP filSEXP) {
RawVector read_file_raw(CharacterVector fil, int buffer_size);
RcppExport SEXP _construe_read_file_raw(SEXP filSEXP, SEXP buffer_sizeSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< CharacterVector >::type fil(filSEXP);
rcpp_result_gen = Rcpp::wrap(read_file_raw(fil));
Rcpp::traits::input_parameter< int >::type buffer_size(buffer_sizeSEXP);
rcpp_result_gen = Rcpp::wrap(read_file_raw(fil, buffer_size));
return rcpp_result_gen;
END_RCPP
}
static const R_CallMethodDef CallEntries[] = {
{"_construe_parse_request", (DL_FUNC) &_construe_parse_request, 1},
{"_construe_parse_request_raw", (DL_FUNC) &_construe_parse_request_raw, 1},
{"_construe_parse_response", (DL_FUNC) &_construe_parse_response, 1},
{"_construe_parse_response_raw", (DL_FUNC) &_construe_parse_response_raw, 1},
{"_construe_parse_request", (DL_FUNC) &_construe_parse_request, 2},
{"_construe_parse_request_raw", (DL_FUNC) &_construe_parse_request_raw, 2},
{"_construe_parse_response", (DL_FUNC) &_construe_parse_response, 2},
{"_construe_parse_response_raw", (DL_FUNC) &_construe_parse_response_raw, 2},
{"_construe_parse_url", (DL_FUNC) &_construe_parse_url, 1},
{"_construe_read_file_raw", (DL_FUNC) &_construe_read_file_raw, 1},
{"_construe_read_file_raw", (DL_FUNC) &_construe_read_file_raw, 2},
{NULL, NULL, 0}
};

75
src/code.cpp

@ -18,14 +18,34 @@
using namespace Rcpp;
using namespace httpparser;
// Return a lower-cased copy of `str`.
//
// The argument is taken by value, so the caller's string is left untouched;
// each byte of the local copy is mapped through std::tolower() in place.
std::string str_tolower(std::string str) {
  for (char &ch : str) {
    // Go through unsigned char before calling std::tolower(): handing it a
    // negative char value is undefined behaviour.
    const unsigned char byte = static_cast<unsigned char>(ch);
    ch = static_cast<char>(std::tolower(byte));
  }
  return str;
}
//' Parse an HTTP request
//'
//' You can use the non- `_raw` version on input you know for sure has is plain text
//' You can use the non- `_raw` version on input you know for sure is plain text
//'
//' @param req HTTP request character string
//' @param headers_lowercase if `TRUE` (the default) names in the `headers` data frame
//' element are converted to lower case
//' @export
//' @examples
//' paste0(c(
//' "GET /uri.cgi HTTP/1.1\r\n",
//' "User-Agent: Mozilla/5.0\r\n",
//' "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n",
//' "Host: 127.0.0.1\r\n", "\r\n"
//' ), collapse = "") -> req
//'
//' res <- parse_request(req)
//' res <- parse_request_raw(charToRaw(req))
// [[Rcpp::export]]
List parse_request(String req) {
List parse_request(String req, bool headers_lowercase = true) {
List l;
@ -43,7 +63,7 @@ List parse_request(String req) {
R_xlen_t idx = 0;
for (std::vector<Request::HeaderItem>::const_iterator it = request.headers.begin(); it != request.headers.end(); ++it) {
names[idx] = it->name;
names[idx] = headers_lowercase ? str_tolower(it->name) : it->name;
vals[idx++] = it->value;
}
@ -77,7 +97,7 @@ List parse_request(String req) {
//' @rdname parse_request
//' @export
// [[Rcpp::export]]
List parse_request_raw(RawVector req) {
List parse_request_raw(RawVector req, bool headers_lowercase = true) {
List l;
@ -93,7 +113,7 @@ List parse_request_raw(RawVector req) {
R_xlen_t idx = 0;
for (std::vector<Request::HeaderItem>::const_iterator it = request.headers.begin(); it != request.headers.end(); ++it) {
names[idx] = it->name;
names[idx] = headers_lowercase ? str_tolower(it->name) : it->name;
vals[idx++] = it->value;
}
@ -126,12 +146,27 @@ List parse_request_raw(RawVector req) {
//' Parse an HTTP response
//'
//' You can use the non- `_raw` version on input you know for sure has is plain text
//' You can use the non- `_raw` version on input you know for sure is plain text
//'
//' @param resp HTTP response character string
//' @param headers_lowercase if `TRUE` (the default) names in the `headers` data frame
//' element are converted to lower case
//' @export
//' @examples
//' paste0(c(
//' "HTTP/1.1 200 OK\r\n",
//' "Server: nginx/1.2.1\r\n",
//' "Content-Type: text/html\r\n",
//' "Content-Length: 8\r\n",
//' "Connection: keep-alive\r\n",
//' "\r\n",
//' "<html />"
//' ), collapse = "") -> resp
//'
//' res <- parse_response(resp)
//' res <- parse_response_raw(charToRaw(resp))
// [[Rcpp::export]]
List parse_response(String resp) {
List parse_response(String resp, bool headers_lowercase = true) {
List l;
@ -149,7 +184,7 @@ List parse_response(String resp) {
R_xlen_t idx = 0;
for (std::vector<Response::HeaderItem>::const_iterator it = response.headers.begin(); it != response.headers.end(); ++it) {
names[idx] = it->name;
names[idx] = headers_lowercase ? str_tolower(it->name) : it->name;
vals[idx++] = it->value;
}
@ -183,7 +218,7 @@ List parse_response(String resp) {
//' @rdname parse_response
//' @export
// [[Rcpp::export]]
List parse_response_raw(RawVector resp) {
List parse_response_raw(RawVector resp, bool headers_lowercase = true) {
List l;
@ -199,7 +234,7 @@ List parse_response_raw(RawVector resp) {
R_xlen_t idx = 0;
for (std::vector<Response::HeaderItem>::const_iterator it = response.headers.begin(); it != response.headers.end(); ++it) {
names[idx] = it->name;
names[idx] = headers_lowercase ? str_tolower(it->name) : it->name;
vals[idx++] = it->value;
}
@ -234,6 +269,9 @@ List parse_response_raw(RawVector resp) {
//'
//' @param urls character vector of URLs
//' @export
//' @examples
//' URL <- "http://www.example.com/dir/subdir?param=1&param=2;param%20=%20#fragment"
//' parse_url(URL)
// [[Rcpp::export]]
DataFrame parse_url(std::vector < std::string > urls) {
@ -297,10 +335,17 @@ DataFrame parse_url(std::vector < std::string > urls) {
//' Read in a file, fast and raw
//'
//' @param fil file to read in (no path expansion is performed)a
//' @param fil file to read in (no path expansion is performed)
//' @param buffer_size larger buffer sizes may speed up reading of
//' very large files. It can also hurt performance, and this
//' function reads in the entire file into memory, so a
//' large buffer size also means more (temporary) memory will
//' be allocated.
//' @export
//' @examples
//' read_file_raw(system.file("extdat", "example.hdr", package = "construe"))
// [[Rcpp::export]]
RawVector read_file_raw(CharacterVector fil) {
RawVector read_file_raw(CharacterVector fil, int buffer_size = 16384) {
// #ifdef _WIN32
// wchar_t* buf;
@ -319,7 +364,11 @@ RawVector read_file_raw(CharacterVector fil) {
// std::ifstream in(fil[0], std::ios::in | std::ios::binary);
// #endif
std::ifstream in(fil[0], std::ios::in | std::ios::binary);
char buf[buffer_size];
std::ifstream in;
in.rdbuf()->pubsetbuf(buf, sizeof buf);
in.open(fil[0], std::ios::in | std::ios::binary);
if (in) {

67
src/httpresponseparser.h

@ -116,35 +116,32 @@ private:
}
break;
case ResponseHttpVersion_major:
if( input == '.' )
{
if( input == ' ' ) {
resp.versionMinor = 0;
state = ResponseHttpVersion_statusCodeStart;
} else if( input == '.' ) {
state = ResponseHttpVersion_minorStart;
}
else if( isDigit(input) )
{
} else if( isDigit(input) ) {
resp.versionMajor = resp.versionMajor * 10 + input - '0';
}
else
{
} else {
return ParsingError;
}
break;
case ResponseHttpVersion_minorStart:
if( isDigit(input) )
{
if( input == ' ' ) {
resp.versionMinor = 0;
state = ResponseHttpVersion_statusCodeStart;
} else if( isDigit(input) ) {
resp.versionMinor = input - '0';
state = ResponseHttpVersion_minor;
}
else
{
} else{
return ParsingError;
}
break;
case ResponseHttpVersion_minor:
if( input == ' ')
{
if( input == ' ') {
state = ResponseHttpVersion_statusCodeStart;
resp.statusCode = 0;
resp.versionMinor = 0;
}
else if( isDigit(input) )
{
@ -156,8 +153,11 @@ private:
}
break;
case ResponseHttpVersion_statusCodeStart:
// printf("ResponseHttpVersion_statusCodeStart\n\n");
if( isDigit(input) )
{
// printf(" - digit - ResponseHttpVersion_statusCodeStart\n\n");
resp.statusCode = input - '0';
state = ResponseHttpVersion_statusCode;
}
@ -167,34 +167,40 @@ private:
}
break;
case ResponseHttpVersion_statusCode:
// printf("ResponseHttpVersion_statusCode\n\n");
if( isDigit(input) )
{
// printf(" - digit - ResponseHttpVersion_statusCode\n\n");
resp.statusCode = resp.statusCode * 10 + input - '0';
}
else
{
if( resp.statusCode < 100 || resp.statusCode > 999 )
{
if( resp.statusCode < 100 || resp.statusCode > 999 ) {
return ParsingError;
}
else if( input == ' ' )
{
} else if( input == ' ' ) {
// printf(" - SPACE - ResponseHttpVersion_statusCode\n\n");
state = ResponseHttpVersion_statusTextStart;
}
else
{
} else if( input == '\r' ) {
// printf(" - CR - ResponseHttpVersion_statusCode\n\n");
resp.status = "";
state = ResponseHttpVersion_newLine;
} else {
return ParsingError;
}
}
break;
case ResponseHttpVersion_statusTextStart:
if( isChar(input) )
{
// printf("ResponseHttpVersion_statusTextStart\n\n");
if( input == '\r' ) {
// printf(" - CR - ResponseHttpVersion_statusTextStart\n\n");
resp.status = "";
state = ResponseHttpVersion_newLine;
} else if( isChar(input) ) {
// printf(" - char - ResponseHttpVersion_statusTextStart\n\n");
resp.status += input;
state = ResponseHttpVersion_statusText;
}
else
{
} else {
return ParsingError;
}
break;
@ -213,8 +219,10 @@ private:
}
break;
case ResponseHttpVersion_newLine:
// printf("ResponseHttpVersion_newLine\n\n");
if( input == '\n' )
{
// printf(" - NL - ResponseHttpVersion_newLine\n\n");
state = HeaderLineStart;
}
else
@ -223,6 +231,7 @@ private:
}
break;
case HeaderLineStart:
// printf("HeaderLineStart\n\n");
if( input == '\r' )
{
state = ExpectingNewline_3;

Loading…
Cancel
Save