A small menubar app that allows you to switch between R versions quickly (if you have multiple versions of R framework installed). https://rud.is/rswitch
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

312 lines
11 KiB

5 years ago
//
// Tokeniser.swift
// SwiftSoup
//
// Created by Nabil Chatbi on 19/10/16.
// Copyright © 2016 Nabil Chatbi.. All rights reserved.
//
import Foundation
/// Produces tokens from HTML input by repeatedly asking the current `TokeniserState` to read
/// from the `CharacterReader` until a token has been emitted, then handing it out via `read()`.
final class Tokeniser {
// Scalar substituted for invalid input; used below for numeric references outside the valid range.
static let replacementChar: UnicodeScalar = "\u{FFFD}" // replaces null character
// Kept sorted because it is passed to `reader.matchesAnySorted`; when the reader sits on one of
// these scalars, `consumeCharacterReference` gives up immediately.
private static let notCharRefCharsSorted: [UnicodeScalar] = [UnicodeScalar.BackslashT, "\n", "\r", UnicodeScalar.BackslashF, " ", "<", UnicodeScalar.Ampersand].sorted()
private let reader: CharacterReader // html input
private let errors: ParseErrorList? // errors found while tokenising
private var state: TokeniserState = TokeniserState.Data // current tokenisation state
private var emitPending: Token? // the token we are about to emit on next read
// Set to true by `emit(_ token:)` and back to false by `read()` once the pending token is returned.
private var isEmitPending: Bool = false
private var charsString: String? // characters pending an emit. Will fall to charsBuilder if more than one
private let charsBuilder: StringBuilder = StringBuilder(1024) // buffers characters to output as one token, if more than one emit per read
let dataBuffer: StringBuilder = StringBuilder(1024) // buffers data looking for </script>
// Points at either `startPending` or `endPending` after `createTagPending(_:)` runs.
var tagPending: Token.Tag = Token.Tag() // tag we are building up
// Reusable token instances; `createTagPending(_:)` resets one of the two tag tokens per tag.
let startPending: Token.StartTag = Token.StartTag()
let endPending: Token.EndTag = Token.EndTag()
// Reused by `read()` to return buffered character data.
let charPending: Token.Char = Token.Char()
let doctypePending: Token.Doctype = Token.Doctype() // doctype building up
let commentPending: Token.Comment = Token.Comment() // comment building up
private var lastStartTag: String? // the last start tag emitted, to test appropriate end tag
// Cleared when a self-closing start tag is emitted; `read()` reports an error if it is still
// false on the next call.
private var selfClosingFlagAcknowledged: Bool = true
/// Creates a tokeniser over `reader`.
/// - Parameters:
///   - reader: The character source to tokenise.
///   - errors: Optional list that collects parse errors; nothing is recorded when `nil`.
init(_ reader: CharacterReader, _ errors: ParseErrorList?) {
self.reader = reader
self.errors = errors
}
/// Returns the next token from the input.
///
/// The current state is run until a non-character token has been emitted. Any character data
/// buffered along the way is returned first; the emitted token then stays queued and is returned
/// by the following call.
/// - Throws: Rethrows errors raised by the state machine while reading.
func read() throws -> Token {
    // A self-closing start tag was emitted earlier and the flag was never acknowledged.
    if selfClosingFlagAcknowledged == false {
        error("Self closing flag not acknowledged")
        selfClosingFlagAcknowledged = true
    }

    // Drive the state machine until some state emits a token.
    while isEmitPending == false {
        try state.read(self, reader)
    }

    // Several strings were emitted: flush the builder as one character token.
    if charsBuilder.length > 0 {
        let buffered: String = charsBuilder.toString()
        charsBuilder.clear()
        charsString = nil
        return charPending.data(buffered)
    }

    // Exactly one string was emitted: return it without going through the builder.
    if let single = charsString {
        let charToken: Token = charPending.data(single)
        charsString = nil
        return charToken
    }

    // No characters outstanding: hand out the queued token itself.
    isEmitPending = false
    return emitPending!
}
/// Queues `token` as the next non-character token returned by `read()`.
///
/// For a start tag, remembers its name in `lastStartTag` and, when the tag is self-closing,
/// clears `selfClosingFlagAcknowledged`. For an end tag, records a parse error if it carries
/// attributes.
/// - Parameter token: The token to queue.
/// - Throws: Whatever `Validate.isFalse` throws when an earlier token is still pending.
func emit(_ token: Token) throws {
    try Validate.isFalse(val: isEmitPending, msg: "There is an unread token pending!")
    emitPending = token
    isEmitPending = true

    if token.type == Token.TokenType.StartTag, let startTag = token as? Token.StartTag {
        // `lastStartTag` is optional, so assign the name directly: a start tag without a name
        // clears it rather than trapping on a force unwrap.
        lastStartTag = startTag._tagName
        if startTag._selfClosing {
            selfClosingFlagAcknowledged = false
        }
    } else if token.type == Token.TokenType.EndTag, let endTag = token as? Token.EndTag {
        if endTag._attributes.size() != 0 {
            error("Attributes incorrectly present on end tag")
        }
    }
}
/// Buffers `str` as character data to be returned by a later `read()`.
///
/// The first string since the last flush is held in `charsString`; from the second string on,
/// everything is accumulated in `charsBuilder` so a run of emits comes out as a single token.
/// This does not set `isEmitPending`; `read()` checks the buffers itself.
func emit(_ str: String) {
    guard let first = charsString else {
        charsString = str
        return
    }
    // Second emit before a read: seed the builder with the string held so far.
    if charsBuilder.length == 0 {
        charsBuilder.append(first)
    }
    charsBuilder.append(str)
}
/// Emits the given scalars, in order, as character data.
func emit(_ chars: [UnicodeScalar]) {
    var scalars = String.UnicodeScalarView()
    scalars.append(contentsOf: chars)
    let text: String = String(scalars)
    emit(text)
}
// func emit(_ codepoints: [Int]) {
// emit(String(codepoints, 0, codepoints.length));
// }
/// Emits a single scalar as character data by converting it to a `String` and emitting that.
func emit(_ c: UnicodeScalar) {
emit(String(c))
}
/// Returns the tokeniser's current state.
func getState() -> TokeniserState {
return state
}
/// Switches the tokeniser to `state` without touching the reader.
func transition(_ state: TokeniserState) {
self.state = state
}
/// Advances the reader, then switches the tokeniser to `state`.
func advanceTransition(_ state: TokeniserState) {
reader.advance()
self.state = state
}
/// Marks the self-closing flag as acknowledged, so the next `read()` does not report
/// "Self closing flag not acknowledged".
func acknowledgeSelfClosingFlag() {
selfClosingFlagAcknowledged = true
}
// One-element template; `consumeCharacterReference` copies it into a local before filling slot 0.
private var codepointHolder: [UnicodeScalar] = [UnicodeScalar(0)!] // holder to not have to keep creating arrays
// Two-element buffer passed `inout` to `Entities.codepointsForName` for named references.
private var multipointHolder: [UnicodeScalar] = [UnicodeScalar(0)!, UnicodeScalar(0)!]
/// Tries to consume a character reference at the reader's current position.
///
/// Callers such as `unescapeEntities(_:)` consume the leading `&` before calling this.
/// Numeric (`#123`, `#x7B`) and named references are both handled. When nothing usable is
/// found, the reader is rewound to where it started and `nil` is returned.
///
/// - Parameters:
///   - additionalAllowedCharacter: If the reader is positioned on this scalar, no reference is
///     attempted and `nil` is returned.
///   - inAttribute: When true, a named reference directly followed by a letter, digit, `=`, `-`
///     or `_` is not treated as a reference.
/// - Returns: One or two scalars for a recognised reference, or `nil` if there is none.
/// - Throws: Whatever `Validate.fail` throws if the entity table yields an unexpected number of
///   scalars for a name.
func consumeCharacterReference(_ additionalAllowedCharacter: UnicodeScalar?, _ inAttribute: Bool) throws -> [UnicodeScalar]? {
    if reader.isEmpty() {
        return nil
    }
    if let allowed = additionalAllowedCharacter, allowed == reader.current() {
        return nil
    }
    if reader.matchesAnySorted(Tokeniser.notCharRefCharsSorted) {
        return nil
    }

    var codeRef: [UnicodeScalar] = codepointHolder
    reader.markPos()

    if reader.matchConsume("#") { // numbered
        let isHexMode: Bool = reader.matchConsumeIgnoreCase("X")
        let numRef: String = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence()
        if numRef.isEmpty { // didn't match anything
            characterReferenceError("numeric reference with no numerals")
            reader.rewindToMark()
            return nil
        }
        if !reader.matchConsume(";") {
            characterReferenceError("missing semicolon") // missing semi
        }

        let base: Int = isHexMode ? 16 : 10
        // `Int(_:radix:)` fails on overflow, and the failable `UnicodeScalar` initialiser rejects
        // surrogates (0xD800...0xDFFF) and values above 0x10FFFF, so one guard covers every
        // out-of-range case without a force unwrap.
        guard let charval = Int(numRef, radix: base), let scalar = UnicodeScalar(charval) else {
            characterReferenceError("character outside of valid range")
            codeRef[0] = Tokeniser.replacementChar
            return codeRef
        }
        // todo: implement number replacement table
        // todo: check for extra illegal unicode points as parse errors
        codeRef[0] = scalar
        return codeRef
    }

    // named
    // get as many letters as possible, and look for matching entities.
    let nameRef: String = reader.consumeLetterThenDigitSequence()
    let looksLegit: Bool = reader.matches(";")
    // found if a base named entity without a ;, or an extended entity with the ;.
    let found: Bool = Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit)
    if !found {
        reader.rewindToMark()
        if looksLegit { // named with semicolon
            characterReferenceError("invalid named reference '\(nameRef)'")
        }
        return nil
    }
    if inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny("=", "-", "_")) {
        // don't want that to match
        reader.rewindToMark()
        return nil
    }
    if !reader.matchConsume(";") {
        characterReferenceError("missing semicolon") // missing semi
    }

    let numChars: Int = Entities.codepointsForName(nameRef, codepoints: &multipointHolder)
    switch numChars {
    case 1:
        codeRef[0] = multipointHolder[0]
        return codeRef
    case 2:
        return multipointHolder
    default:
        try Validate.fail(msg: "Unexpected characters returned for \(nameRef) num: \(numChars)")
        return multipointHolder
    }
}
/// Resets the reusable start or end tag token and makes it the tag under construction.
/// - Parameter start: `true` to begin a start tag, `false` to begin an end tag.
/// - Returns: The tag token now stored in `tagPending`.
@discardableResult
func createTagPending(_ start: Bool) -> Token.Tag {
    if start {
        tagPending = startPending.reset()
    } else {
        tagPending = endPending.reset()
    }
    return tagPending
}
/// Finalises the tag under construction and emits it as the pending token.
/// - Throws: Errors from `finaliseTag()` or from `emit(_:)` when a token is already pending.
func emitTagPending()throws {
try tagPending.finaliseTag()
try emit(tagPending)
}
/// Resets the reusable comment token before a new comment is built up.
func createCommentPending() {
commentPending.reset()
}
/// Emits the comment token under construction as the pending token.
/// - Throws: Errors from `emit(_:)` when a token is already pending.
func emitCommentPending()throws {
try emit(commentPending)
}
/// Resets the reusable doctype token before a new doctype is built up.
func createDoctypePending() {
doctypePending.reset()
}
/// Emits the doctype token under construction as the pending token.
/// - Throws: Errors from `emit(_:)` when a token is already pending.
func emitDoctypePending()throws {
try emit(doctypePending)
}
/// Resets `dataBuffer` via `Token.reset` so it can be reused as a temporary buffer.
func createTempBuffer() {
Token.reset(dataBuffer)
}
/// Reports whether the tag under construction has the same name (ignoring case) as the last
/// start tag that was emitted.
/// - Returns: `false` when no start tag name has been recorded.
/// - Throws: Errors from reading the pending tag's name.
func isAppropriateEndTagToken() throws -> Bool {
    guard let expectedName = lastStartTag else {
        return false
    }
    let pendingName = try tagPending.name()
    return pendingName.equalsIgnoreCase(string: expectedName)
}
/// Returns the name of the last emitted start tag, or `nil` if none has been recorded.
func appropriateEndTagName() -> String? {
    // The stored value is already optional, so it can be returned as is.
    return lastStartTag
}
/// Records an "unexpected character" parse error for the reader's current character.
/// Does nothing when there is no error list or the list cannot accept more errors.
/// - Parameter state: The tokeniser state named in the message.
func error(_ state: TokeniserState) {
    guard let errorList = errors, errorList.canAddError() else {
        return
    }
    let position = reader.getPos()
    let message = "Unexpected character '\(String(reader.current()))' in input state [\(state.description)]"
    errorList.add(ParseError(position, message))
}
/// Records an "unexpected end of file" parse error at the reader's current position.
/// Does nothing when there is no error list or the list cannot accept more errors.
/// - Parameter state: The tokeniser state named in the message.
func eofError(_ state: TokeniserState) {
    guard let errorList = errors, errorList.canAddError() else {
        return
    }
    let position = reader.getPos()
    let message = "Unexpectedly reached end of file (EOF) in input state [\(state.description)]"
    errorList.add(ParseError(position, message))
}
/// Records an "invalid character reference" parse error at the reader's current position.
/// Does nothing when there is no error list or the list cannot accept more errors.
/// - Parameter message: Detail appended after the fixed prefix.
private func characterReferenceError(_ message: String) {
    guard let errorList = errors, errorList.canAddError() else {
        return
    }
    let position = reader.getPos()
    errorList.add(ParseError(position, "Invalid character reference: \(message)"))
}
/// Records a parse error with the given message at the reader's current position.
/// Does nothing when there is no error list or the list cannot accept more errors.
private func error(_ errorMsg: String) {
    guard let errorList = errors, errorList.canAddError() else {
        return
    }
    let position = reader.getPos()
    errorList.add(ParseError(position, errorMsg))
}
/// Placeholder namespace check: currently always returns `true`.
func currentNodeInHtmlNS() -> Bool {
// todo: implement namespaces correctly
return true
// Intended logic, kept from the original for reference:
// Element currentNode = currentNode()
// return currentNode != null && currentNode.namespace().equals("HTML")
}
/// Consumes the rest of the reader and returns its text with character references unescaped.
///
/// Text up to each `&` is copied through. The `&` is then consumed and a character reference is
/// attempted; if none is recognised, a literal `&` is written instead.
/// - Parameter inAttribute: Passed through to `consumeCharacterReference(_:_:)`.
/// - Returns: The unescaped string read from the reader.
/// - Throws: Rethrows errors from `consumeCharacterReference(_:_:)`.
func unescapeEntities(_ inAttribute: Bool) throws -> String {
    let builder: StringBuilder = StringBuilder()
    while !reader.isEmpty() {
        builder.append(reader.consumeTo(UnicodeScalar.Ampersand))
        guard reader.matches(UnicodeScalar.Ampersand) else {
            continue
        }
        reader.consume()

        // No reference (nil or an empty result): keep the ampersand as literal text.
        guard let scalars = try consumeCharacterReference(nil, inAttribute), !scalars.isEmpty else {
            builder.append(UnicodeScalar.Ampersand)
            continue
        }

        builder.appendCodePoint(scalars[0])
        if scalars.count == 2 {
            builder.appendCodePoint(scalars[1])
        }
    }
    return builder.toString()
}
}