Browse Source

switched to system psl lib

latest
boB Rudis 2 years ago
parent
commit
ba78397e42
No known key found for this signature in database GPG Key ID: 1D7529BE14E2BBA9
9 changed files with 3 additions and 6484 deletions
  1. +1
    -1
      DESCRIPTION
  2. BIN
      README_files/figure-gfm/bench-1.png
  3. +1
    -1
      src/Makevars
  4. +0
    -147
      src/config.h
  5. +0
    -212
      src/libpsl.h
  6. +0
    -279
      src/lookup_string_in_fixed_set.c
  7. +1
    -3
      src/psl-main.cpp
  8. +0
    -1943
      src/psl.c
  9. +0
    -3898
      src/suffixes_dafsa.c

+ 1
- 1
DESCRIPTION View File

@@ -17,7 +17,7 @@ Description: The 'Public Suffix List' (<https://publicsuffix.org/>) is a collect
to extract internet domain components using the public suffix list base data.
URL: https://gitlab.com/hrbrmstr/psl
BugReports: https://gitlab.com/hrbrmstr/psl/issues
SystemRequirements: C++11
SystemRequirements: C++11; libpsl
Encoding: UTF-8
License: MIT + file LICENSE
Suggests:


BIN
README_files/figure-gfm/bench-1.png View File

Before After
Width: 1920  |  Height: 960  |  Size: 74KB Width: 1920  |  Height: 960  |  Size: 72KB

+ 1
- 1
src/Makevars View File

@@ -1,3 +1,3 @@
CXX_STD = CXX11
PKG_CXXFLAGS =
PKG_LIBS = -L. -liconv -lidn2
PKG_LIBS = -L. -lpsl

+ 0
- 147
src/config.h View File

@@ -1,147 +0,0 @@
/* config.h. Generated from config.h.in by configure. */
/* config.h.in. Generated from configure.ac by autoheader. */

/* generate PSL data using libicu */
/* #undef BUILTIN_GENERATOR_LIBICU */

/* generate PSL data using libidn */
/* #undef BUILTIN_GENERATOR_LIBIDN */

/* generate PSL data using libidn2 */
#define BUILTIN_GENERATOR_LIBIDN2 1

/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP
systems. This function is required for `alloca.c' support on those systems.
*/
/* #undef CRAY_STACKSEG_END */

/* Define to 1 if using `alloca.c'. */
/* #undef C_ALLOCA */

/* Define to 1 if translation of program messages to the user's native
language is requested. */
/* #undef ENABLE_NLS */

/* Define to 1 if you have `alloca', as a function or macro. */
#define HAVE_ALLOCA 1

/* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix).
*/
#define HAVE_ALLOCA_H 1

/* Define to 1 if you have the MacOS X function CFLocaleCopyCurrent in the
CoreFoundation framework. */
#define HAVE_CFLOCALECOPYCURRENT 1

/* Define to 1 if you have the MacOS X function CFPreferencesCopyAppValue in
the CoreFoundation framework. */
#define HAVE_CFPREFERENCESCOPYAPPVALUE 1

/* Define to 1 if you have the `clock_gettime' function. */
#define HAVE_CLOCK_GETTIME 1

/* Define if the GNU dcgettext() function is already present or preinstalled.
*/
/* #undef HAVE_DCGETTEXT */

/* Define to 1 if you have the <dlfcn.h> header file. */
#define HAVE_DLFCN_H 1

/* Define to 1 if you have the `fmemopen' function. */
#define HAVE_FMEMOPEN 1

/* Define if the GNU gettext() function is already present or preinstalled. */
/* #undef HAVE_GETTEXT */

/* Define if you have the iconv() function and it works. */
#define HAVE_ICONV 1

/* Define to 1 if you have the <inttypes.h> header file. */
#define HAVE_INTTYPES_H 1

/* Define to 1 if you have the <memory.h> header file. */
#define HAVE_MEMORY_H 1

/* Define to 1 if you have the `nl_langinfo' function. */
#define HAVE_NL_LANGINFO 1

/* Define to 1 if you have the <stdint.h> header file. */
#define HAVE_STDINT_H 1

/* Define to 1 if you have the <stdlib.h> header file. */
#define HAVE_STDLIB_H 1

/* Define to 1 if you have the <strings.h> header file. */
#define HAVE_STRINGS_H 1

/* Define to 1 if you have the <string.h> header file. */
#define HAVE_STRING_H 1

/* Define to 1 if you have the `strndup' function. */
#define HAVE_STRNDUP 1

/* Define to 1 if you have the <sys/stat.h> header file. */
#define HAVE_SYS_STAT_H 1

/* Define to 1 if you have the <sys/types.h> header file. */
#define HAVE_SYS_TYPES_H 1

/* Define to 1 if you have the <unistd.h> header file. */
#define HAVE_UNISTD_H 1

/* Define to 1 or 0, depending whether the compiler supports simple visibility
declarations. */
#define HAVE_VISIBILITY 1

/* Define as const if the declaration of iconv() needs const. */
#define ICONV_CONST

/* Define to the sub-directory where libtool stores uninstalled libraries. */
#define LT_OBJDIR ".libs/"

/* Define to the address where bug reports for this package should be sent. */
#define PACKAGE_BUGREPORT "tim.ruehsen@gmx.de"

/* Define to the full name of this package. */
#define PACKAGE_NAME "libpsl"

/* Define to the full name and version of this package. */
#define PACKAGE_STRING "libpsl 0.20.2"

/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "libpsl"

/* Define to the home page for this package. */
#define PACKAGE_URL "https://github.com/rockdaboot/libpsl"

/* Define to the version of this package. */
#define PACKAGE_VERSION "0.20.2"

/* If using the C implementation of alloca, define if you know the
direction of stack growth for your system; otherwise it will be
automatically deduced at runtime.
STACK_DIRECTION > 0 => grows toward higher addresses
STACK_DIRECTION < 0 => grows toward lower addresses
STACK_DIRECTION = 0 => direction of growth unknown */
/* #undef STACK_DIRECTION */

/* Define to 1 if you have the ANSI C header files. */
#define STDC_HEADERS 1

/* generate PSL data using libicu */
/* #undef WITH_LIBICU */

/* generate PSL data using libidn */
/* #undef WITH_LIBIDN */

/* generate PSL data using libidn2 */
#define WITH_LIBIDN2 1

/* Define to `__inline__' or `__inline' if that's what the C compiler
calls it, or to nothing if 'inline' is not supported under any name. */
#ifndef __cplusplus
/* #undef inline */
#endif

/* Define to `unsigned int' if <sys/types.h> does not define. */
/* #undef size_t */

+ 0
- 212
src/libpsl.h View File

@@ -1,212 +0,0 @@
/*
* Copyright(c) 2014-2018 Tim Ruehsen
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
* This file is part of libpsl.
*
* Header file for libpsl library routines
*
* Changelog
* 20.03.2014 Tim Ruehsen created
*
*/
#ifdef __cplusplus
extern "C" {
#endif

#ifndef LIBPSL_LIBPSL_H
#define LIBPSL_LIBPSL_H

#include <stdio.h>
#include <time.h>

/* Version of the library this header describes: full string, then components. */
#define PSL_VERSION "0.20.2"
#define PSL_VERSION_MAJOR 0
#define PSL_VERSION_MINOR 20
#define PSL_VERSION_PATCH 2
/* Packed numeric form, one byte per component: 0x00 (major), 0x14 (minor = 20),
 * 0x02 (patch), i.e. (major << 16) | (minor << 8) | patch. */
#define PSL_VERSION_NUMBER 0x001402

/*
 * PSL_API decorates every public declaration below. It is only defined here
 * when the includer has not already provided its own definition.
 */
#ifndef PSL_API
/* Building the library itself with symbol-visibility support: export. */
#if defined BUILDING_PSL && HAVE_VISIBILITY
# define PSL_API __attribute__ ((__visibility__("default")))
/* Building the library as a DLL with MSVC: export. */
#elif defined BUILDING_PSL && defined _MSC_VER && !defined PSL_STATIC
# define PSL_API __declspec(dllexport)
/* Consuming the library as a DLL with MSVC: import. */
#elif defined _MSC_VER && !defined PSL_STATIC
# define PSL_API __declspec(dllimport)
/* Every other configuration: no decoration. */
#else
# define PSL_API
#endif
#endif

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Bit flags accepted by the 'type' argument of psl_is_public_suffix2().
 * Each flag occupies its own bit so they can be ORed together.
 */
#define PSL_TYPE_ICANN        0x01
#define PSL_TYPE_PRIVATE      0x02
#define PSL_TYPE_NO_STAR_RULE 0x04
/* Convenience mask: both the ICANN and the PRIVATE bits. */
#define PSL_TYPE_ANY          (PSL_TYPE_ICANN | PSL_TYPE_PRIVATE)

/**
 * psl_error_t:
 * @PSL_SUCCESS: Successful return.
 * @PSL_ERR_INVALID_ARG: Invalid argument.
 * @PSL_ERR_CONVERTER: Failed to open libicu utf-16 converter.
 * @PSL_ERR_TO_UTF16: Failed to convert to utf-16.
 * @PSL_ERR_TO_LOWER: Failed to convert utf-16 to lowercase.
 * @PSL_ERR_TO_UTF8: Failed to convert utf-16 to utf-8.
 * @PSL_ERR_NO_MEM: Failed to allocate memory.
 *
 * Status codes returned by PSL functions.
 * Zero is success, every failure is a negative value, and positive
 * values are kept free for future non-error results.
 */
typedef enum {
    PSL_SUCCESS         =  0,
    PSL_ERR_INVALID_ARG = -1,
    PSL_ERR_CONVERTER   = -2,  /* could not open the libicu utf-16 converter */
    PSL_ERR_TO_UTF16    = -3,  /* conversion to utf-16 failed */
    PSL_ERR_TO_LOWER    = -4,  /* lowercasing the utf-16 string failed */
    PSL_ERR_TO_UTF8     = -5,  /* conversion from utf-16 back to utf-8 failed */
    PSL_ERR_NO_MEM      = -6   /* memory allocation failed */
} psl_error_t;

typedef struct _psl_ctx_st psl_ctx_t;

/* frees PSL context */
PSL_API
void
psl_free(psl_ctx_t *psl);

/* frees memory allocated by libpsl routines */
PSL_API
void
psl_free_string(char *str);

/* loads PSL data from file */
PSL_API
psl_ctx_t *
psl_load_file(const char *fname);

/* loads PSL data from FILE pointer */
PSL_API
psl_ctx_t *
psl_load_fp(FILE *fp);

/* retrieves builtin PSL data */
PSL_API
const psl_ctx_t *
psl_builtin(void);

/* retrieves most recent PSL data */
PSL_API
psl_ctx_t *
psl_latest(const char *fname);

/* checks whether domain is a public suffix or not */
PSL_API
int
psl_is_public_suffix(const psl_ctx_t *psl, const char *domain);

/* checks whether domain is a public suffix regarding the type or not */
PSL_API
int
psl_is_public_suffix2(const psl_ctx_t *psl, const char *domain, int type);

/* checks whether cookie_domain is acceptable for domain or not */
PSL_API
int
psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname, const char *cookie_domain);

/* returns the longest non-registrable domain within 'domain' or NULL if none found */
PSL_API
const char *
psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain);

/* returns the shortest possible registrable domain part or NULL if domain is not registrable at all */
PSL_API
const char *
psl_registrable_domain(const psl_ctx_t *psl, const char *domain);

/* convert a string into lowercase UTF-8 */
PSL_API
psl_error_t
psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale, char **lower);

/* does not include exceptions */
PSL_API
int
psl_suffix_count(const psl_ctx_t *psl);

/* just counts exceptions */
PSL_API
int
psl_suffix_exception_count(const psl_ctx_t *psl);

/* just counts wildcards */
PSL_API
int
psl_suffix_wildcard_count(const psl_ctx_t *psl);

/* returns mtime of PSL source file */
PSL_API
time_t
psl_builtin_file_time(void);

/* returns SHA1 checksum (hex-encoded, lowercase) of PSL source file */
PSL_API
const char *
psl_builtin_sha1sum(void);

/* returns file name of PSL source file */
PSL_API
const char *
psl_builtin_filename(void);

/* returns name of distribution PSL data file */
PSL_API
const char *
psl_dist_filename(void);

/* returns library version string */
PSL_API
const char *
psl_get_version(void);

/* checks library version number */
PSL_API
int
psl_check_version_number(int version);

/* returns whether the built-in data is outdated or not */
PSL_API
int
psl_builtin_outdated(void);

#ifdef __cplusplus
}
#endif

#endif /* LIBPSL_LIBPSL_H */

#ifdef __cplusplus
}
#endif

+ 0
- 279
src/lookup_string_in_fixed_set.c View File

@@ -1,279 +0,0 @@
/* Copyright 2015-2016 The Chromium Authors. All rights reserved.
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE.chromium file.
*
* Converted to C89 2015 by Tim Rühsen
*/

#include <stddef.h>

/*
 * _GCC_VERSION_AT_LEAST(major, minor) is non-zero when the compiler defines
 * __GNUC__/__GNUC_MINOR__ and reports a version of at least major.minor.
 * When those macros are not defined it always evaluates to 0.
 */
#if defined(__GNUC__) && defined(__GNUC_MINOR__)
# define _GCC_VERSION_AT_LEAST(major, minor) ((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
#else
# define _GCC_VERSION_AT_LEAST(major, minor) 0
#endif

#define CHECK_LT(a, b) if ((a) >= b) return 0

/* Sequence length keyed by the high nibble of a byte; 0 means "not a
 * multibyte lead byte". */
static const char multibyte_length_table[16] = {
    0, 0, 0, 0, 0, 0, 0, 0, /* 0x00-0x7F */
    0, 0, 0, 0,             /* 0x80-0xBF */
    2, 2,                   /* 0xC0-0xDF */
    3,                      /* 0xE0-0xEF */
    4                       /* 0xF0-0xFF */
};

/*
 * Length of the multibyte sequence introduced by byte c, looked up from
 * its top four bits. Yields zero when c cannot start a multibyte UTF-8
 * sequence.
 */
static int GetMultibyteLength(char c) {
    const unsigned int high_nibble = ((unsigned char)c) >> 4;

    return multibyte_length_table[high_nibble];
}

/*
 * Step the graph cursor *pos forward by one byte and update the key cursor
 * to match.
 *
 * While *multibyte_start is set, the key advances one byte per call and
 * *multibyte_start is cleared once the whole sequence has been consumed.
 * Otherwise, if the key sits on a multibyte lead byte, only the mode switch
 * is recorded (the key does not move); a single-byte character simply
 * advances the key.
 */
static void NextPos(const unsigned char** pos,
                    const char** key,
                    const char** multibyte_start)
{
    ++*pos;

    if (*multibyte_start == 0) {
        if (GetMultibyteLength(**key)) {
            /* The dafsa matched the multibyte prefix marker; the actual
             * bytes of the sequence are matched from the next round on. */
            *multibyte_start = *key;
        } else {
            /* Plain single-byte character consumed. */
            ++*key;
        }
        return;
    }

    /* Inside a multibyte sequence: consume one more byte of it. */
    ++*key;
    if (*key - *multibyte_start == GetMultibyteLength(**multibyte_start)) {
        /* That was the final byte of the sequence. */
        *multibyte_start = 0;
    }
}

/*
 * Decode the next child offset stored at *pos and add it to *offset.
 *
 * pos:    in/out cursor into the offset list. Advanced past the bytes that
 *         were read, or set to end when the entry carries the 0x80 flag so
 *         that the following call returns 0.
 * end:    one past the last byte of the graph.
 * offset: in/out; the decoded value is accumulated onto it.
 *
 * Bits 0x60 of the first byte select the encoding: 0x60 = three bytes,
 * 0x40 = two bytes, anything else = one byte (low six bits).
 *
 * Returns 1 if an offset was read, 0 otherwise.
 */
static int GetNextOffset(const unsigned char** pos,
                         const unsigned char* end,
                         const unsigned char** offset)
{
    size_t bytes_consumed;

    if (*pos == end)
        return 0;

    /* When reading an offset the byte array must always contain at least
     * three more bytes to consume. First the offset to read, then a node
     * to skip over and finally a destination node. No object can be smaller
     * than one byte.
     *
     * The remaining length is compared directly rather than computing
     * *pos + 2, which would form a pointer more than one past the end of
     * the array (undefined behaviour) when only one byte is left. */
    if (end - *pos < 3)
        return 0;

    switch (**pos & 0x60) {
    case 0x60: /* Read three byte offset */
        *offset += (((*pos)[0] & 0x1F) << 16) | ((*pos)[1] << 8) | (*pos)[2];
        bytes_consumed = 3;
        break;
    case 0x40: /* Read two byte offset */
        *offset += (((*pos)[0] & 0x1F) << 8) | (*pos)[1];
        bytes_consumed = 2;
        break;
    default: /* Read one byte offset */
        *offset += (*pos)[0] & 0x3F;
        bytes_consumed = 1;
    }

    if ((**pos & 0x80) != 0) {
        /* Flagged entry: park the cursor at end so iteration stops. */
        *pos = end;
    } else {
        *pos += bytes_consumed;
    }
    return 1;
}

/*
 * Tell whether the node byte at offset carries the end-of-label flag
 * (bit 0x80). Returns 0 when offset is not inside [graph, end).
 */
static int IsEOL(const unsigned char* offset, const unsigned char* end)
{
    CHECK_LT(offset, end);
    return (*offset & 0x80) ? 1 : 0;
}

/*
 * Compare one graph byte (matcher) against the byte the key currently
 * points at. No bounds checking happens here; the caller must already
 * have validated the position matcher was read from.
 *
 * Returns non-zero on a match, zero otherwise.
 */
static int IsMatchUnchecked(const unsigned char matcher,
                            const char* key,
                            const char* multibyte_start)
{
    const unsigned char key_byte = (const unsigned char)*key;

    if (multibyte_start) {
        /* Multibyte matching mode: the lead byte is stored XOR 0x80 (which
         * also pins the sequence length), the following bytes XOR 0xC0. */
        const int mask = (multibyte_start == key) ? 0x80 : 0xC0;

        return (matcher ^ mask) == key_byte;
    }

    /* The key is on a multibyte lead byte but multibyte mode has not been
     * entered yet: the dafsa must hold the special mode-switch byte here. */
    if (GetMultibyteLength(*key))
        return matcher == 0x1F;

    /* Ordinary single-byte character. */
    return matcher == key_byte;
}

/*
 * Bounds-checked match of the graph byte at offset against the current key
 * byte, for nodes that are NOT the last character of a label.
 * Returns 0 when offset is not below end.
 */
static int IsMatch(const unsigned char* offset,
                   const unsigned char* end,
                   const char* key,
                   const char* multibyte_start)
{
    unsigned char node;

    CHECK_LT(offset, end);
    node = *offset;
    return IsMatchUnchecked(node, key, multibyte_start);
}

/*
 * Bounds-checked match of the graph byte at offset against the current key
 * byte, for nodes that ARE the last character of a label: bit 0x80 is
 * toggled on the graph byte before it is compared.
 * Returns 0 when offset is not below end.
 */
static int IsEndCharMatch(const unsigned char* offset,
                          const unsigned char* end,
                          const char* key,
                          const char* multibyte_start)
{
    unsigned char node;

    CHECK_LT(offset, end);
    node = (unsigned char)(*offset ^ 0x80);
    return IsMatchUnchecked(node, key, multibyte_start);
}

/*
 * Try to read a return-value node at offset.
 *
 * A node qualifies only outside multibyte mode and when its top three bits
 * are 100 (byte & 0xE0 == 0x80); its low four bits are then stored in
 * *return_value.
 *
 * Returns 1 when a value was stored, 0 otherwise (including when offset is
 * not below end). *return_value is left untouched on failure.
 */
static int GetReturnValue(const unsigned char* offset,
                          const unsigned char* end,
                          const char* multibyte_start,
                          int* return_value)
{
    CHECK_LT(offset, end);

    if (multibyte_start || (*offset & 0xE0) != 0x80)
        return 0;

    *return_value = *offset & 0x0F;
    return 1;
}

/*
* Looks up the string |key| with length |key_length| in a fixed set of
* strings. The set of strings must be known at compile time. It is converted to
* a graph structure named a DAFSA (Deterministic Acyclic Finite State
* Automaton) by the script psl-make-dafsa during compilation. This permits
* efficient (in time and space) lookup. The graph generated by psl-make-dafsa
* takes the form of a constant byte array which should be supplied via the
* |graph| and |length| parameters. The return value is kDafsaNotFound,
* kDafsaFound, or a bitmap consisting of one or more of kDafsaExceptionRule,
* kDafsaWildcardRule and kDafsaPrivateRule ORed together.
*
* Lookup a domain key in a byte array generated by psl-make-dafsa.
*
* In this implementation "not found" is reported as -1; on success the value
* returned is the low four bits of the matching return-value node (see
* GetReturnValue).
* NOTE(review): the kDafsa* names above are not defined in this file —
* presumably the caller maps the numeric results onto them; verify.
*/

/* prototype to skip warning with -Wmissing-prototypes */
int LookupStringInFixedSet(const unsigned char*, size_t,const char*, size_t);

int LookupStringInFixedSet(const unsigned char* graph,
size_t length,
const char* key,
size_t key_length)
{
/* pos walks the current node's list of child offsets. */
const unsigned char* pos = graph;
const unsigned char* end = graph + length;
/* offset accumulates the decoded child offsets and then walks the child. */
const unsigned char* offset = pos;
const char* key_end = key + key_length;
/* Non-null while in the middle of matching a multibyte key character. */
const char* multibyte_start = 0;

/* One iteration per candidate child of the current node. */
while (GetNextOffset(&pos, end, &offset)) {
/*char <char>+ end_char offsets
* char <char>+ return value
* char end_char offsets
* char return value
* end_char offsets
* return_value
*/
/* Set once at least one key character has been consumed in this child. */
int did_consume = 0;

if (key != key_end && !IsEOL(offset, end)) {
/* Leading <char> is not a match. Don't dive into this child */
if (!IsMatch(offset, end, key, multibyte_start))
continue;
did_consume = 1;
NextPos(&offset, &key, &multibyte_start);
/* Possible matches at this point:
* <char>+ end_char offsets
* <char>+ return value
* end_char offsets
* return value
*/

/* Remove all remaining <char> nodes possible */
while (!IsEOL(offset, end) && key != key_end) {
if (!IsMatch(offset, end, key, multibyte_start))
return -1;
NextPos(&offset, &key, &multibyte_start);
}
}
/* Possible matches at this point:
* end_char offsets
* return_value
* If one or more <char> elements were consumed, a failure
* to match is terminal. Otherwise, try the next node.
*/
if (key == key_end) {
/* Whole key consumed: only a return-value node counts as a hit. */
int return_value;

if (GetReturnValue(offset, end, multibyte_start, &return_value))
return return_value;
/* The DAFSA guarantees that if the first char is a match, all
* remaining char elements MUST match if the key is truly present.
*/
if (did_consume)
return -1;
continue;
}
if (!IsEndCharMatch(offset, end, key, multibyte_start)) {
if (did_consume)
return -1; /* Unexpected */
continue;
}
NextPos(&offset, &key, &multibyte_start);
/* The child's own offset list begins right after its end_char. */
pos = offset; /* Dive into child */
}

return -1; /* No match */
}

/* Declared up front so -Wmissing-prototypes stays quiet. */
int GetUtfMode(const unsigned char *graph, size_t length);

/*
 * Reports whether the final byte of the graph is below 0x80.
 * Returns 1 in that case, and 0 otherwise or when the graph is empty.
 */
int GetUtfMode(const unsigned char *graph, size_t length)
{
    if (length == 0)
        return 0;

    return (graph[length - 1] < 0x80) ? 1 : 0;
}

+ 1
- 3
src/psl-main.cpp View File

@@ -1,8 +1,6 @@
#include <Rcpp.h>

#include <regex>

#include "libpsl.h"
#include <libpsl.h>

using namespace Rcpp;



+ 0
- 1943
src/psl.c
File diff suppressed because it is too large
View File


+ 0
- 3898
src/suffixes_dafsa.c
File diff suppressed because it is too large
View File


Loading…
Cancel
Save