// Copyright (C) 2020-2024 Free Software Foundation, Inc. // This file is part of GCC. // GCC is free software; you can redistribute it and/or modify it under // the terms of the GNU General Public License as published by the Free // Software Foundation; either version 3, or (at your option) any later // version. // GCC is distributed in the hope that it will be useful, but WITHOUT ANY // WARRANTY; without even the implied warranty of MERCHANTABILITY or // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License // for more details. // You should have received a copy of the GNU General Public License // along with GCC; see the file COPYING3. If not see // . #include "rust-codepoint.h" #include "rust-system.h" #include "rust-lex.h" #include "rust-diagnostics.h" #include "rust-linemap.h" #include "rust-session-manager.h" #include "safe-ctype.h" #include "cpplib.h" #include "rust-keyword-values.h" namespace Rust { // TODO: move to separate compilation unit? // overload += for uint32_t to allow 32-bit encoded utf-8 to be added std::string & operator+= (std::string &str, Codepoint char32) { if (char32.value < 0x80) { str += static_cast (char32.value); } else if (char32.value < (0x1F + 1) << (1 * 6)) { str += static_cast (0xC0 | ((char32.value >> 6) & 0x1F)); str += static_cast (0x80 | ((char32.value >> 0) & 0x3F)); } else if (char32.value < (0x0F + 1) << (2 * 6)) { str += static_cast (0xE0 | ((char32.value >> 12) & 0x0F)); str += static_cast (0x80 | ((char32.value >> 6) & 0x3F)); str += static_cast (0x80 | ((char32.value >> 0) & 0x3F)); } else if (char32.value < (0x07 + 1) << (3 * 6)) { str += static_cast (0xF0 | ((char32.value >> 18) & 0x07)); str += static_cast (0x80 | ((char32.value >> 12) & 0x3F)); str += static_cast (0x80 | ((char32.value >> 6) & 0x3F)); str += static_cast (0x80 | ((char32.value >> 0) & 0x3F)); } else { rust_debug ("Invalid unicode codepoint found: '%u' ", char32.value); } return str; } std::string Codepoint::as_string () { std::string 
str; // str += Codepoint (value); str += *this; return str; } /* Includes all allowable float digits EXCEPT _ and . as that needs lookahead * for handling. */ bool is_float_digit (uint32_t number) { return ISDIGIT (number) || number == 'E' || number == 'e'; } /* Basically ISXDIGIT from safe-ctype but may change if Rust's encoding or * whatever is different */ bool is_x_digit (uint32_t number) { return ISXDIGIT (number); } bool is_octal_digit (uint32_t number) { return number >= '0' && number <= '7'; } bool is_bin_digit (uint32_t number) { return number == '0' || number == '1'; } bool check_valid_float_dot_end (uint32_t character) { return character != '.' && character != '_' && !ISALPHA (character); } bool is_whitespace (uint32_t character) { // https://doc.rust-lang.org/reference/whitespace.html return character == '\t' || character == '\n' || character == '\v' || character == '\f' || character == '\r' || character == ' ' || character == 0x0085 // next line || character == 0x200e // left-to-right mark || character == 0x200f // right-to-left mark || character == 0x2028 // line separator || character == 0x2029; // pragraph separator } bool is_non_decimal_int_literal_separator (uint32_t character) { return character == 'x' || character == 'o' || character == 'b'; } bool is_identifier_start (uint32_t codepoint) { return (cpp_check_xid_property (codepoint) & CPP_XID_START) || codepoint == '_'; } bool is_identifier_continue (uint32_t codepoint) { return cpp_check_xid_property (codepoint) & CPP_XID_CONTINUE; } Lexer::Lexer (const std::string &input, Linemap *linemap) : input (RAIIFile::create_error ()), current_line (1), current_column (1), line_map (linemap), dump_lex_out ({}), raw_input_source (new BufferInputSource (input, 0)), input_queue{*raw_input_source}, token_queue (TokenSource (this)) {} Lexer::Lexer (const char *filename, RAIIFile file_input, Linemap *linemap, tl::optional dump_lex_opt) : input (std::move (file_input)), current_line (1), current_column (1), 
line_map (linemap), dump_lex_out (dump_lex_opt), raw_input_source (new FileInputSource (input.get_raw ())), input_queue{*raw_input_source}, token_queue (TokenSource (this)) { // inform line_table that file is being entered and is in line 1 if (linemap) line_map->start_file (filename, current_line); } Lexer::~Lexer () { /* ok apparently stop (which is equivalent of original code in destructor) is * meant to be called after all files have finished parsing, for cleanup. On * the other hand, actual code that it calls to leave a certain line map is * mentioned in GCC docs as being useful for "just leaving an included header" * and stuff like that, so this line mapping functionality may need fixing. * FIXME: find out whether this occurs. */ // line_map->stop(); } bool Lexer::input_source_is_valid_utf8 () { return raw_input_source->is_valid (); } location_t Lexer::get_current_location () { if (line_map) return linemap_position_for_column (line_table, current_column); else // If we have no linemap, we're lexing something without proper locations return UNDEF_LOCATION; } Codepoint Lexer::peek_input (int n) { return input_queue.peek (n); } Codepoint Lexer::peek_input () { return peek_input (0); } void Lexer::skip_input (int n) { input_queue.skip (n); } void Lexer::skip_input () { skip_input (0); } void Lexer::skip_token (int n) { // dump tokens if dump-lex option is enabled if (dump_lex_out.has_value ()) dump_and_skip (n); else token_queue.skip (n); } void Lexer::dump_and_skip (int n) { std::ofstream &out = dump_lex_out.value (); bool found_eof = false; const_TokenPtr tok; for (int i = 0; i < n + 1; i++) { if (!found_eof) { tok = peek_token (); found_eof |= tok->get_id () == Rust::END_OF_FILE; location_t loc = tok->get_locus (); out << "token_id_to_str (); out << (tok->has_str () ? 
(std::string (", text=") + tok->get_str () + std::string (", typehint=") + std::string (tok->get_type_hint_str ())) : "") << " "; out << Linemap::location_to_string (loc) << '\n'; } token_queue.skip (0); } } void Lexer::replace_current_token (TokenPtr replacement) { token_queue.replace_current_value (replacement); rust_debug ("called 'replace_current_token' - this is deprecated"); } /* Determines whether the string passed in is a keyword or not. If it is, it * returns the keyword name. */ TokenId Lexer::classify_keyword (const std::string &str) { auto &keywords = Rust::Values::Keywords::keywords_tokens; auto keyword = keywords.find (str); if (keyword == keywords.end ()) return IDENTIFIER; auto id = keyword->second; // We now have the expected token ID of the reserved keyword. However, some // keywords are reserved starting in certain editions. For example, `try` is // only a reserved keyword in editions >=2018. The language might gain new // reserved keywords in the future. // // https://doc.rust-lang.org/reference/keywords.html#reserved-keywords // `try` is not a reserved keyword before 2018 if (Session::get_instance ().options.get_edition () == CompileOptions::Edition::E2015 && id == TRY) return IDENTIFIER; return id; } TokenPtr Lexer::build_token () { // loop to go through multiple characters to build a single token while (true) { location_t loc = get_current_location (); current_char = peek_input (); skip_input (); // detect shebang // Must be the first thing on the first line, starting with #! // But since an attribute can also start with an #! we don't count it as a // shebang line when after any whitespace or comments there is a [. If it // is a shebang line we simple drop the line. Otherwise we don't consume // any characters and fall through to the real tokenizer. 
if (current_line == 1 && current_column == 1 && current_char == '#' && peek_input () == '!') { int n = 1; while (true) { Codepoint next_char = peek_input (n); if (is_whitespace (next_char.value)) n++; else if ((next_char == '/' && peek_input (n + 1) == '/' && peek_input (n + 2) != '!' && peek_input (n + 2) != '/') || (next_char == '/' && peek_input (n + 1) == '/' && peek_input (n + 2) == '/' && peek_input (n + 3) == '/')) { // two // or four //// // A single line comment // (but not an inner or outer doc comment) n += 2; next_char = peek_input (n); while (next_char != '\n' && !next_char.is_eof ()) { n++; next_char = peek_input (n); } if (next_char == '\n') n++; } else if (next_char == '/' && peek_input (n + 1) == '*' && peek_input (n + 2) == '*' && peek_input (n + 3) == '/') { /**/ n += 4; } else if (next_char == '/' && peek_input (n + 1) == '*' && peek_input (n + 2) == '*' && peek_input (n + 3) == '*' && peek_input (n + 4) == '/') { /***/ n += 5; } else if ((next_char == '/' && peek_input (n + 1) == '*' && peek_input (n + 2) != '*' && peek_input (n + 2) != '!') || (next_char == '/' && peek_input (n + 1) == '*' && peek_input (n + 2) == '*' && peek_input (n + 3) == '*')) { // one /* or three /*** // Start of a block comment // (but not an inner or outer doc comment) n += 2; int level = 1; while (level > 0) { if (peek_input (n).is_eof ()) break; else if (peek_input (n) == '/' && peek_input (n + 1) == '*') { n += 2; level += 1; } else if (peek_input (n) == '*' && peek_input (n + 1) == '/') { n += 2; level -= 1; } else n++; } } else if (next_char != '[') { // definitely shebang, ignore the first line while (current_char != '\n' && !current_char.is_eof ()) { current_char = peek_input (); skip_input (); } // newline current_line++; current_column = 1; // tell line_table that new line starts start_line (current_line, max_column_hint); break; } else break; /* Definitely not a shebang line. 
*/ } } // return end of file token if end of file if (current_char.is_eof ()) return Token::make (END_OF_FILE, loc); // if not end of file, start tokenising switch (current_char.value) { /* ignore whitespace characters for tokens but continue updating * location */ case '\n': // newline case 0x0085: // next line case 0x2028: // line separator case 0x2029: // paragraph separator current_line++; current_column = 1; // tell line_table that new line starts start_line (current_line, max_column_hint); continue; case '\r': // cr // Ignore, we expect a newline (lf) soon. continue; case ' ': // space current_column++; continue; case '\t': // horizontal tab // width of a tab is not well-defined, assume 8 spaces current_column += 8; continue; case '\v': // vertical tab case 0x000c: // form feed case 0x200e: // left-to-right mark case 0x200f: // right-to-left mark // Ignored. continue; // punctuation - actual tokens case '=': if (peek_input () == '>') { // match arm arrow skip_input (); current_column += 2; loc += 1; return Token::make (MATCH_ARROW, loc); } else if (peek_input () == '=') { // equality operator skip_input (); current_column += 2; loc += 1; return Token::make (EQUAL_EQUAL, loc); } else { // assignment operator current_column++; return Token::make (EQUAL, loc); } case '(': current_column++; return Token::make (LEFT_PAREN, loc); case '-': if (peek_input () == '>') { // return type specifier skip_input (); current_column += 2; loc += 1; return Token::make (RETURN_TYPE, loc); } else if (peek_input () == '=') { // minus-assign skip_input (); current_column += 2; loc += 1; return Token::make (MINUS_EQ, loc); } else { // minus current_column++; return Token::make (MINUS, loc); } case '+': if (peek_input () == '=') { // add-assign skip_input (); current_column += 2; loc += 1; return Token::make (PLUS_EQ, loc); } else { // add current_column++; return Token::make (PLUS, loc); } case ')': current_column++; return Token::make (RIGHT_PAREN, loc); case ';': current_column++; 
return Token::make (SEMICOLON, loc); case '*': if (peek_input () == '=') { // multiplication-assign skip_input (); current_column += 2; loc += 1; return Token::make (ASTERISK_EQ, loc); } else { // multiplication current_column++; return Token::make (ASTERISK, loc); } case ',': current_column++; return Token::make (COMMA, loc); case '/': if (peek_input () == '=') { // division-assign skip_input (); current_column += 2; loc += 1; return Token::make (DIV_EQ, loc); } else if ((peek_input () == '/' && peek_input (1) != '!' && peek_input (1) != '/') || (peek_input () == '/' && peek_input (1) == '/' && peek_input (2) == '/')) { // two // or four //// // single line comment // (but not an inner or outer doc comment) skip_input (); current_column += 2; current_char = peek_input (); // basically ignore until line finishes while (current_char != '\n' && !current_char.is_eof ()) { skip_input (); current_column++; // not used current_char = peek_input (); } continue; } else if (peek_input () == '/' && (peek_input (1) == '!' || peek_input (1) == '/')) { /* single line doc comment, inner or outer. 
*/ bool is_inner = peek_input (1) == '!'; skip_input (1); current_column += 3; std::string str; str.reserve (32); current_char = peek_input (); while (current_char != '\n') { skip_input (); if (current_char == '\r') { Codepoint next_char = peek_input (); if (next_char == '\n') { current_char = '\n'; break; } rust_error_at ( loc, "Isolated CR %<\\r%> not allowed in doc comment"); current_char = next_char; continue; } if (current_char.is_eof ()) { rust_error_at ( loc, "unexpected EOF while looking for end of comment"); break; } str += current_char; current_char = peek_input (); } skip_input (); current_line++; current_column = 1; // tell line_table that new line starts start_line (current_line, max_column_hint); str.shrink_to_fit (); loc += str.size () - 1; if (is_inner) return Token::make_inner_doc_comment (loc, std::move (str)); else return Token::make_outer_doc_comment (loc, std::move (str)); } else if (peek_input () == '*' && peek_input (1) == '*' && peek_input (2) == '/') { /**/ skip_input (2); current_column += 4; continue; } else if (peek_input () == '*' && peek_input (1) == '*' && peek_input (2) == '*' && peek_input (3) == '/') { /***/ skip_input (3); current_column += 5; continue; } else if ((peek_input () == '*' && peek_input (1) != '!' 
&& peek_input (1) != '*') || (peek_input () == '*' && peek_input (1) == '*' && peek_input (2) == '*')) { // one /* or three /*** // block comment // (but not an inner or outer doc comment) skip_input (); current_column += 2; int level = 1; while (level > 0) { current_char = peek_input (); if (current_char.is_eof ()) { rust_error_at ( loc, "unexpected EOF while looking for end of comment"); break; } // if /* found if (current_char == '/' && peek_input (1) == '*') { // skip /* characters skip_input (1); current_column += 2; level += 1; continue; } // ignore until */ is found if (current_char == '*' && peek_input (1) == '/') { // skip */ characters skip_input (1); current_column += 2; level -= 1; continue; } if (current_char == '\n') { skip_input (); current_line++; current_column = 1; // tell line_table that new line starts start_line (current_line, max_column_hint); continue; } skip_input (); current_column++; } // refresh new token continue; } else if (peek_input () == '*' && (peek_input (1) == '!' || peek_input (1) == '*')) { // block doc comment, inner /*! 
or outer /** bool is_inner = peek_input (1) == '!'; skip_input (1); current_column += 3; std::string str; str.reserve (96); int level = 1; while (level > 0) { current_char = peek_input (); if (current_char.is_eof ()) { rust_error_at ( loc, "unexpected EOF while looking for end of comment"); break; } // if /* found if (current_char == '/' && peek_input (1) == '*') { // skip /* characters skip_input (1); current_column += 2; level += 1; str += "/*"; continue; } // ignore until */ is found if (current_char == '*' && peek_input (1) == '/') { // skip */ characters skip_input (1); current_column += 2; level -= 1; if (level > 0) str += "*/"; continue; } if (current_char == '\r' && peek_input (1) != '\n') rust_error_at ( loc, "Isolated CR %<\\r%> not allowed in doc comment"); if (current_char == '\n') { skip_input (); current_line++; current_column = 1; // tell line_table that new line starts start_line (current_line, max_column_hint); str += '\n'; continue; } str += current_char; skip_input (); current_column++; } str.shrink_to_fit (); loc += str.size () - 1; if (is_inner) return Token::make_inner_doc_comment (loc, std::move (str)); else return Token::make_outer_doc_comment (loc, std::move (str)); } else { // division current_column++; return Token::make (DIV, loc); } case '%': if (peek_input () == '=') { // modulo-assign skip_input (); current_column += 2; loc += 1; return Token::make (PERCENT_EQ, loc); } else { // modulo current_column++; return Token::make (PERCENT, loc); } case '^': if (peek_input () == '=') { // xor-assign? skip_input (); current_column += 2; loc += 1; return Token::make (CARET_EQ, loc); } else { // xor? 
current_column++; return Token::make (CARET, loc); } case '<': if (peek_input () == '<') { if (peek_input (1) == '=') { // left-shift assign skip_input (1); current_column += 3; loc += 2; return Token::make (LEFT_SHIFT_EQ, loc); } else { // left-shift skip_input (); current_column += 2; loc += 1; return Token::make (LEFT_SHIFT, loc); } } else if (peek_input () == '=') { // smaller than or equal to skip_input (); current_column += 2; loc += 1; return Token::make (LESS_OR_EQUAL, loc); } else { // smaller than current_column++; return Token::make (LEFT_ANGLE, loc); } break; case '>': if (peek_input () == '>') { if (peek_input (1) == '=') { // right-shift-assign skip_input (1); current_column += 3; loc += 2; return Token::make (RIGHT_SHIFT_EQ, loc); } else { // right-shift skip_input (); current_column += 2; loc += 1; return Token::make (RIGHT_SHIFT, loc); } } else if (peek_input () == '=') { // larger than or equal to skip_input (); current_column += 2; loc += 1; return Token::make (GREATER_OR_EQUAL, loc); } else { // larger than current_column++; return Token::make (RIGHT_ANGLE, loc); } case ':': if (peek_input () == ':') { // scope resolution :: skip_input (); current_column += 2; loc += 1; return Token::make (SCOPE_RESOLUTION, loc); } else { // single colon : current_column++; return Token::make (COLON, loc); } case '!': // no special handling for macros in lexer? 
if (peek_input () == '=') { // not equal boolean operator skip_input (); current_column += 2; loc += 1; return Token::make (NOT_EQUAL, loc); } else { // not equal unary operator current_column++; return Token::make (EXCLAM, loc); } case '?': current_column++; return Token::make (QUESTION_MARK, loc); case '#': current_column++; return Token::make (HASH, loc); case '[': current_column++; return Token::make (LEFT_SQUARE, loc); case ']': current_column++; return Token::make (RIGHT_SQUARE, loc); case '{': current_column++; return Token::make (LEFT_CURLY, loc); case '}': current_column++; return Token::make (RIGHT_CURLY, loc); case '@': current_column++; return Token::make (PATTERN_BIND, loc); case '$': current_column++; return Token::make (DOLLAR_SIGN, loc); case '~': current_column++; return Token::make (TILDE, loc); case '\\': current_column++; return Token::make (BACKSLASH, loc); case '`': current_column++; return Token::make (BACKTICK, loc); case '|': if (peek_input () == '=') { // bitwise or-assign? skip_input (); current_column += 2; loc += 1; return Token::make (PIPE_EQ, loc); } else if (peek_input () == '|') { // logical or skip_input (); current_column += 2; loc += 1; return Token::make (OR, loc); } else { // bitwise or current_column++; return Token::make (PIPE, loc); } case '&': if (peek_input () == '=') { // bitwise and-assign? skip_input (); current_column += 2; loc += 1; return Token::make (AMP_EQ, loc); } else if (peek_input () == '&') { // logical and skip_input (); current_column += 2; loc += 1; return Token::make (LOGICAL_AND, loc); } else { // bitwise and/reference current_column++; return Token::make (AMP, loc); } case '.': if (peek_input () == '.') { if (peek_input (1) == '.') { // ellipsis skip_input (1); current_column += 3; loc += 2; return Token::make (ELLIPSIS, loc); } else if (peek_input (1) == '=') { // ..= skip_input (1); current_column += 3; loc += 2; return Token::make (DOT_DOT_EQ, loc); } else { // .. 
skip_input (); current_column += 2; loc += 1; return Token::make (DOT_DOT, loc); } } else /*if (!ISDIGIT (peek_input ()))*/ { // single dot . // Only if followed by a non-number - otherwise is float // nope, float cannot start with '.'. current_column++; return Token::make (DOT, loc); } } // TODO: special handling of _ in the lexer? instead of being identifier // byte character, byte string and raw byte string literals if (current_char == 'b') { if (peek_input () == '\'') return parse_byte_char (loc); else if (peek_input () == '"') return parse_byte_string (loc); else if (peek_input () == 'r' && (peek_input (1) == '#' || peek_input (1) == '"')) return parse_raw_byte_string (loc); } // raw identifiers and raw strings if (current_char == 'r') { Codepoint peek = peek_input (); Codepoint peek1 = peek_input (1); // TODO (tamaron) parse Unicode ident if (peek == '#' && is_identifier_start (peek1.value)) { TokenPtr raw_ident_ptr = parse_raw_identifier (loc); if (raw_ident_ptr != nullptr) return raw_ident_ptr; else continue; /* input got parsed, it just wasn't valid. An error was produced. */ } else { TokenPtr maybe_raw_string_ptr = maybe_parse_raw_string (loc); if (maybe_raw_string_ptr != nullptr) return maybe_raw_string_ptr; } } // find identifiers and keywords. 
if (is_identifier_start (current_char.value)) return parse_identifier_or_keyword (loc); // int and float literals if (ISDIGIT (current_char.value)) { // _ not allowed as first char if (current_char == '0' && is_non_decimal_int_literal_separator (peek_input ().value)) { // handle binary, octal, hex literals TokenPtr non_dec_int_lit_ptr = parse_non_decimal_int_literals (loc); if (non_dec_int_lit_ptr != nullptr) return non_dec_int_lit_ptr; } else { // handle decimals (integer or float) TokenPtr decimal_or_float_ptr = parse_decimal_int_or_float (loc); if (decimal_or_float_ptr != nullptr) return decimal_or_float_ptr; } } // string literals if (current_char == '"') return parse_string (loc); // char literals and lifetime names if (current_char == '\'') { TokenPtr char_or_lifetime_ptr = parse_char_or_lifetime (loc); if (char_or_lifetime_ptr != nullptr) return char_or_lifetime_ptr; } // DEBUG: check for specific character problems: if (current_char == '0') rust_debug ("'0' uncaught before unexpected character"); else if (current_char == ']') rust_debug ("']' uncaught before unexpected character"); else if (current_char == 0x5d) rust_debug ("whatever 0x5d is (not '0' or ']') uncaught before " "unexpected character"); // didn't match anything so error rust_error_at (loc, "unexpected character %<%x%>", current_char.value); current_column++; } } // Parses in a type suffix. 
std::pair Lexer::parse_in_type_suffix () { std::string suffix; suffix.reserve (5); int additional_length_offset = 0; // get suffix while (ISALPHA (current_char.value) || ISDIGIT (current_char.value) || current_char == '_') { if (current_char == '_') { // don't add _ to suffix skip_input (); current_char = peek_input (); additional_length_offset++; continue; } additional_length_offset++; suffix += current_char; skip_input (); current_char = peek_input (); } if (suffix.empty ()) { // no type suffix: do nothing but also no error return std::make_pair (CORETYPE_UNKNOWN, additional_length_offset); } else if (suffix == "f32") { return std::make_pair (CORETYPE_F32, additional_length_offset); } else if (suffix == "f64") { return std::make_pair (CORETYPE_F64, additional_length_offset); } else if (suffix == "i8") { return std::make_pair (CORETYPE_I8, additional_length_offset); } else if (suffix == "i16") { return std::make_pair (CORETYPE_I16, additional_length_offset); } else if (suffix == "i32") { return std::make_pair (CORETYPE_I32, additional_length_offset); } else if (suffix == "i64") { return std::make_pair (CORETYPE_I64, additional_length_offset); } else if (suffix == "i128") { return std::make_pair (CORETYPE_I128, additional_length_offset); } else if (suffix == "isize") { return std::make_pair (CORETYPE_ISIZE, additional_length_offset); } else if (suffix == "u8") { return std::make_pair (CORETYPE_U8, additional_length_offset); } else if (suffix == "u16") { return std::make_pair (CORETYPE_U16, additional_length_offset); } else if (suffix == "u32") { return std::make_pair (CORETYPE_U32, additional_length_offset); } else if (suffix == "u64") { return std::make_pair (CORETYPE_U64, additional_length_offset); } else if (suffix == "u128") { return std::make_pair (CORETYPE_U128, additional_length_offset); } else if (suffix == "usize") { return std::make_pair (CORETYPE_USIZE, additional_length_offset); } else { rust_error_at (get_current_location (), "unknown number suffix 
%qs", suffix.c_str ()); return std::make_pair (CORETYPE_UNKNOWN, additional_length_offset); } } // Parses in the exponent part (if any) of a float literal. std::pair Lexer::parse_in_exponent_part () { int additional_length_offset = 0; std::string str; if (current_char == 'E' || current_char == 'e') { // add exponent to string as strtod works with it str += current_char; skip_input (); current_char = peek_input (); additional_length_offset++; // special - and + handling if (current_char == '-') { str += '-'; skip_input (); current_char = peek_input (); additional_length_offset++; } else if (current_char == '+') { // don't add + but still skip input skip_input (); current_char = peek_input (); additional_length_offset++; } // parse another decimal number for exponent auto str_length = parse_in_decimal (); str += std::get<0> (str_length); additional_length_offset += std::get<1> (str_length); } return std::make_pair (str, additional_length_offset); } // Parses a decimal integer. std::tuple Lexer::parse_in_decimal () { /* A pure decimal contains only digits. */ bool pure_decimal = true; int additional_length_offset = 0; std::string str; while (ISDIGIT (current_char.value) || current_char.value == '_') { if (current_char == '_') { pure_decimal = false; // don't add _ to number skip_input (); current_char = peek_input (); additional_length_offset++; continue; } additional_length_offset++; str += current_char; skip_input (); current_char = peek_input (); } return std::make_tuple (str, additional_length_offset, pure_decimal); } /* Parses escapes (and string continues) in "byte" strings and characters. Does * not support unicode. 
*/ std::tuple Lexer::parse_escape (char opening_char) { int additional_length_offset = 0; char output_char = 0; // skip to actual letter skip_input (); current_char = peek_input (); additional_length_offset++; switch (current_char.value) { case 'x': { auto hex_escape_pair = parse_partial_hex_escape (); long hexLong = hex_escape_pair.first; additional_length_offset += hex_escape_pair.second; if (hexLong > 255 || hexLong < 0) rust_error_at ( get_current_location (), "byte \\x escape %<\\x%x%> out of range - allows up to %<\\xFF%>", static_cast (hexLong)); /* TODO: restore capital for escape output - gcc pretty-printer doesn't * support %X directly */ char hexChar = static_cast (hexLong); output_char = hexChar; } break; case 'n': output_char = '\n'; break; case 'r': output_char = '\r'; break; case 't': output_char = '\t'; break; case '\\': output_char = '\\'; break; case '0': output_char = '\0'; break; case '\'': output_char = '\''; break; case '"': output_char = '"'; break; case 'u': rust_error_at (get_current_location (), "cannot have a unicode escape \\u in a byte %s", opening_char == '\'' ? "character" : "string"); // Try to parse it anyway, just to skip it parse_partial_unicode_escape (); return std::make_tuple (output_char, additional_length_offset, false); case '\r': case '\n': // string continue return std::make_tuple (0, parse_partial_string_continue (), true); default: rust_error_at (get_current_location (), "unknown escape sequence %<\\%s%>", current_char.as_string ().c_str ()); // returns false if no parsing could be done // return false; return std::make_tuple (output_char, additional_length_offset, false); break; } // all non-special cases (string continue) should skip their used char skip_input (); current_char = peek_input (); additional_length_offset++; // returns true if parsing was successful // return true; return std::make_tuple (output_char, additional_length_offset, false); } /* Parses an escape (or string continue) in a string or character. 
Supports * unicode escapes. */ std::tuple Lexer::parse_utf8_escape () { Codepoint output_char; int additional_length_offset = 0; // skip to actual letter skip_input (); current_char = peek_input (); additional_length_offset++; switch (current_char.value) { case 'x': { auto hex_escape_pair = parse_partial_hex_escape (); long hexLong = hex_escape_pair.first; additional_length_offset += hex_escape_pair.second; if (hexLong > 127 || hexLong < 0) rust_error_at ( get_current_location (), "ascii \\x escape %<\\x%x%> out of range - allows up to %<\\x7F%>", static_cast (hexLong)); /* TODO: restore capital for escape output - gcc pretty-printer doesn't * support %X directly */ char hexChar = static_cast (hexLong); output_char = hexChar; } break; case 'n': output_char = '\n'; break; case 'r': output_char = '\r'; break; case 't': output_char = '\t'; break; case '\\': output_char = '\\'; break; case '0': output_char = '\0'; break; case '\'': output_char = '\''; break; case '"': output_char = '"'; break; case 'u': { auto unicode_escape_pair = parse_partial_unicode_escape (); output_char = unicode_escape_pair.first; additional_length_offset += unicode_escape_pair.second; return std::make_tuple (output_char, additional_length_offset, false); } break; case '\r': case '\n': // string continue return std::make_tuple (0, parse_partial_string_continue (), true); default: rust_error_at (get_current_location (), "unknown escape sequence %<\\%s%>", current_char.as_string ().c_str ()); // returns false if no parsing could be done // return false; return std::make_tuple (output_char, additional_length_offset, false); break; } /* all non-special cases (unicode, string continue) should skip their used * char */ skip_input (); current_char = peek_input (); additional_length_offset++; // returns true if parsing was successful // return true; return std::make_tuple (output_char, additional_length_offset, false); } // Parses the body of a string continue that has been found in an escape. 
int Lexer::parse_partial_string_continue () { int additional_length_offset = 1; // string continue // TODO use utf-8 codepoint to skip whitespaces while (is_whitespace (current_char.value)) { if (current_char == '\n') { current_line++; current_column = 1; // tell line_table that new line starts start_line (current_line, max_column_hint); // reset "length" additional_length_offset = 1; // get next char skip_input (); current_char = peek_input (); continue; } skip_input (); current_char = peek_input (); additional_length_offset++; } return additional_length_offset; } /* Parses the body of a '\x' escape. Note that it does not check that the number * is valid and smaller than 255. */ std::pair Lexer::parse_partial_hex_escape () { // hex char string (null-terminated) char hexNum[3] = {0, 0, 0}; // first hex char current_char = peek_input (1); int additional_length_offset = 1; if (!is_x_digit (current_char.value)) { rust_error_at (get_current_location (), "invalid character %<\\x%s%> in \\x sequence", current_char.as_string ().c_str ()); return std::make_pair (0, 0); } hexNum[0] = current_char.value; // second hex char skip_input (); current_char = peek_input (1); additional_length_offset++; if (!is_x_digit (current_char.value)) { rust_error_at (get_current_location (), "invalid character %<\\x%c%s%> in \\x sequence", hexNum[0], current_char.as_string ().c_str ()); return std::make_pair (0, 1); } skip_input (); hexNum[1] = current_char.value; long hexLong = std::strtol (hexNum, nullptr, 16); return std::make_pair (hexLong, additional_length_offset); } // Parses the body of a unicode escape. std::pair Lexer::parse_partial_unicode_escape () { skip_input (); current_char = peek_input (); int additional_length_offset = 0; if (current_char != '{') { rust_error_at (get_current_location (), "unicode escape should start with %<{%>"); /* Skip what should probaby have been between brackets. 
*/ while (is_x_digit (current_char.value) || current_char == '_') { skip_input (); current_char = peek_input (); additional_length_offset++; } return std::make_pair (Codepoint (0), additional_length_offset); } skip_input (); current_char = peek_input (); additional_length_offset++; if (current_char == '_') { rust_error_at (get_current_location (), "unicode escape cannot start with %<_%>"); skip_input (); current_char = peek_input (); additional_length_offset++; // fallthrough and try to parse the rest anyway } // parse unicode escape - 1-6 hex digits std::string num_str; num_str.reserve (6); // loop through to add entire hex number to string while (is_x_digit (current_char.value) || current_char.value == '_') { if (current_char == '_') { // don't add _ to number skip_input (); current_char = peek_input (); additional_length_offset++; continue; } additional_length_offset++; // add raw hex numbers num_str += current_char; skip_input (); current_char = peek_input (); } if (current_char == '}') { skip_input (); current_char = peek_input (); additional_length_offset++; } else { // actually an error, but allow propagation anyway Assume that // wrong bracketm whitespace or single/double quotes are wrong // termination, otherwise it is a wrong character, then skip to the actual // terminator. 
// TODO use utf-8 codepoint to skip whitespaces if (current_char == '{' || is_whitespace (current_char.value) || current_char == '\'' || current_char == '"') { rust_error_at (get_current_location (), "expected terminating %<}%> in unicode escape"); return std::make_pair (Codepoint (0), additional_length_offset); } else { rust_error_at (get_current_location (), "invalid character %<%s%> in unicode escape", current_char.as_string ().c_str ()); // TODO use utf-8 codepoint to skip whitespaces while (current_char != '}' && current_char != '{' && !is_whitespace (current_char.value) && current_char != '\'' && current_char != '"') { skip_input (); current_char = peek_input (); additional_length_offset++; } // Consume the actual closing bracket if found if (current_char == '}') { skip_input (); current_char = peek_input (); additional_length_offset++; } return std::make_pair (Codepoint (0), additional_length_offset); } } // ensure 1-6 hex characters if (num_str.length () > 6 || num_str.length () < 1) { rust_error_at (get_current_location (), "unicode escape should be between 1 and 6 hex " "characters; it is %lu", (unsigned long) num_str.length ()); // return false; return std::make_pair (Codepoint (0), additional_length_offset); } unsigned long hex_num = std::strtoul (num_str.c_str (), nullptr, 16); if (hex_num > 0xd7ff && hex_num < 0xe000) { rust_error_at ( get_current_location (), "unicode escape cannot be a surrogate value (D800 to DFFF)"); return std::make_pair (Codepoint (0), additional_length_offset); } if (hex_num > 0x10ffff) { rust_error_at (get_current_location (), "unicode escape cannot be larger than 10FFFF"); return std::make_pair (Codepoint (0), additional_length_offset); } // return true; return std::make_pair (Codepoint (static_cast (hex_num)), additional_length_offset); } // Parses a byte character. 
// Handles an escaped byte, a plain ASCII byte, or an empty literal (reported
// as an error).  LOC is advanced by the consumed length before it is attached
// to the returned token.
TokenPtr
Lexer::parse_byte_char (location_t loc)
{
  skip_input ();
  current_column++;
  // make current char the next character
  current_char = peek_input ();

  int length = 1;

  // char to save
  Codepoint byte_char = 0;

  // detect escapes
  if (current_char == '\\')
    {
      auto escape_length_pair = parse_escape ('\'');
      byte_char = std::get<0> (escape_length_pair);
      length += std::get<1> (escape_length_pair);

      current_char = peek_input ();

      if (current_char != '\'')
        {
          rust_error_at (get_current_location (), "unclosed %<byte char%>");
        }

      skip_input ();
      current_char = peek_input ();
      length++; // go to next char
    }
  else if (current_char != '\'')
    {
      // otherwise, get character from direct input character
      byte_char = current_char;

      if (!byte_char.is_ascii ())
        {
          rust_error_at (get_current_location (),
                         "non-ASCII character in %<byte char%>");
        }

      skip_input ();
      current_char = peek_input ();
      length++;

      if (current_char != '\'')
        {
          rust_error_at (get_current_location (), "unclosed %<byte char%>");
        }

      skip_input ();
      current_char = peek_input ();
      length++; // go to next char
    }
  else
    {
      rust_error_at (get_current_location (),
                     "no character inside %<%> for %<byte char%>");
    }

  current_column += length;

  loc += length - 1;

  return Token::make_byte_char (loc, byte_char.value);
}

// Parses a byte string.
// Collects bytes up to the closing quote.  Reaching end of input first is
// reported against the start of the literal and yields an END_OF_FILE token.
TokenPtr
Lexer::parse_byte_string (location_t loc)
{
  // step over the opening quote
  skip_input ();
  current_column++;

  std::string bytes;
  bytes.reserve (16); // some sensible default

  current_char = peek_input ();

  const location_t literal_start = get_current_location ();

  for (;;)
    {
      if (current_char == '"' || current_char.is_eof ())
        break;

      if (current_char == '\\')
        {
          const auto escape = parse_escape ('"');
          const char escaped_byte = std::get<0> (escape);
          const int escape_length = std::get<1> (escape);
          // a string continue contributes no byte, only column movement
          const bool is_continue = escaped_byte == 0 && std::get<2> (escape);

          if (is_continue)
            {
              current_column += escape_length - 1;
            }
          else
            {
              bytes += escaped_byte;
              current_column += 1 + escape_length;
            }

          continue;
        }

      if (current_char.value == '\n')
        {
          current_line++;
          current_column = 1;
          // tell line_table that new line starts
          start_line (current_line, max_column_hint);
        }
      else
        {
          current_column++;
        }

      bytes += current_char;
      skip_input ();
      current_char = peek_input ();
    }

  if (current_char.is_eof ())
    {
      rust_error_at (literal_start, "unended byte string literal");
      return Token::make (END_OF_FILE, get_current_location ());
    }

  // the loop only stops on a quote or on end of input
  if (current_char != '"')
    rust_unreachable ();

  // consume the closing quote
  current_column++;
  skip_input ();
  current_char = peek_input ();

  bytes.shrink_to_fit ();
  loc += bytes.size () - 1;

  return Token::make_byte_string (loc, std::move (bytes));
}

// Parses a raw byte string.
// The literal ends at a quote followed by as many '#' as preceded the opening
// quote.  Characters above 127 are reported and stored as 0.  Running out of
// input before the terminator is reported and yields an END_OF_FILE token.
TokenPtr
Lexer::parse_raw_byte_string (location_t loc)
{
  // raw byte string literals
  std::string str;
  str.reserve (16); // some sensible default

  int length = 1;
  int hash_count = 0;

  // get hash count at beginning
  skip_input ();
  current_char = peek_input ();
  length++;
  while (current_char == '#')
    {
      hash_count++;
      length++;

      skip_input ();
      current_char = peek_input ();
    }

  if (current_char != '"')
    {
      rust_error_at (get_current_location (),
                     "raw byte string has no opening %<\"%>");
    }

  skip_input ();
  current_char = peek_input ();
  length++;

  while (true)
    {
      // Without this exit an unterminated literal would spin forever: end of
      // input never matches the closing quote checked below.
      if (current_char.is_eof ())
        {
          current_column += length;
          rust_error_at (loc, "unended raw byte string literal");
          return Token::make (END_OF_FILE, get_current_location ());
        }

      if (current_char == '"')
        {
          bool enough_hashes = true;

          for (int i = 0; i < hash_count; i++)
            {
              if (peek_input (i + 1) != '#')
                {
                  enough_hashes = false;
                  break;
                }
            }

          if (enough_hashes)
            {
              // skip enough input and peek enough input
              skip_input (hash_count);
              current_char = peek_input ();
              length += hash_count + 1;
              break;
            }
        }

      if (current_char.value > 127)
        {
          rust_error_at (get_current_location (),
                         "character %<%s%> in raw byte string out of range",
                         current_char.as_string ().c_str ());
          current_char = 0;
        }

      length++;

      str += current_char;
      skip_input ();
      current_char = peek_input ();
    }

  current_column += length;

  loc += length - 1;

  str.shrink_to_fit ();

  return Token::make_byte_string (loc, std::move (str));
}

// Parses a raw identifier.
TokenPtr Lexer::parse_raw_identifier (location_t loc) { // raw identifier std::string str; str.reserve (16); // default skip_input (); current_char = peek_input (); current_column += 2; bool first_is_underscore = current_char == '_'; int length = 0; current_char = peek_input (); // loop through entire name while (is_identifier_continue (current_char.value)) { length++; str += current_char; skip_input (); current_char = peek_input (); } current_column += length; rust_debug ("raw ident: %s", str.c_str ()); // if just a single underscore, not an identifier if (first_is_underscore && length == 1) rust_error_at (get_current_location (), "%<_%> is not a valid raw identifier"); using namespace Rust::Values; std::set invalid{ Keywords::CRATE, Keywords::EXTERN_KW, Keywords::SELF, Keywords::SUPER, Keywords::SELF_ALIAS, }; if (invalid.find (str) != invalid.end ()) { rust_error_at (get_current_location (), "%qs is a forbidden raw identifier", str.c_str ()); return nullptr; } else { str.shrink_to_fit (); loc += length - 1; return Token::make_identifier (loc, std::move (str)); } } // skip broken string input (unterminated strings) void Lexer::skip_broken_string_input (Codepoint current_char) { while (current_char != '"' && !current_char.is_eof ()) { if (current_char == '\n') { current_line++; current_column = 1; } else { current_column++; } skip_input (); current_char = peek_input (); } if (current_char == '"') { current_column++; skip_input (); current_char = peek_input (); } rust_debug ("skipped to %d:%d due to bad quotes", current_line, current_column); } // Parses a string. TokenPtr Lexer::parse_string (location_t loc) { std::string str; str.reserve (16); // some sensible default current_char = peek_input (); const location_t string_begin_locus = get_current_location (); // FIXME: This fails if the input ends. How do we check for EOF? 
while (current_char.value != '"' && !current_char.is_eof ()) { if (current_char.value == '\\') { int length = 1; // parse escape auto utf8_escape_pair = parse_utf8_escape (); current_char = std::get<0> (utf8_escape_pair); if (current_char == Codepoint (0) && std::get<2> (utf8_escape_pair)) length = std::get<1> (utf8_escape_pair) - 1; else length += std::get<1> (utf8_escape_pair); if (current_char != Codepoint (0) || !std::get<2> (utf8_escape_pair)) str += current_char.as_string (); current_column += length; // FIXME: should remove this but can't. // `parse_utf8_escape` does not update `current_char` correctly. current_char = peek_input (); continue; } current_column++; if (current_char.value == '\n') { current_line++; current_column = 1; // tell line_table that new line starts start_line (current_line, max_column_hint); } str += current_char; skip_input (); current_char = peek_input (); } if (current_char.value == '"') { current_column++; skip_input (); current_char = peek_input (); } else if (current_char.is_eof ()) { rust_error_at (string_begin_locus, "unended string literal"); return Token::make (END_OF_FILE, get_current_location ()); } else { rust_unreachable (); } str.shrink_to_fit (); return Token::make_string (loc, std::move (str)); } // Parses an identifier or keyword. 
// The first character is already in current_char on entry.  A lone '_'
// becomes an UNDERSCORE token; otherwise the text is classified as a keyword
// or returned as an identifier.
TokenPtr
Lexer::parse_identifier_or_keyword (location_t loc)
{
  std::string str;
  str.reserve (16); // default
  str += current_char.as_string ();

  bool first_is_underscore = current_char == '_';

  int length = 1;
  current_char = peek_input ();

  // loop through entire name
  while (is_identifier_continue (current_char.value))
    {
      length++;

      str += current_char.as_string ();
      skip_input ();
      current_char = peek_input ();
    }

  current_column += length;

  // if just a single underscore, not an identifier
  if (first_is_underscore && length == 1)
    return Token::make (UNDERSCORE, loc);

  str.shrink_to_fit ();

  loc += length - 1;

  TokenId keyword = classify_keyword (str);
  if (keyword == IDENTIFIER)
    return Token::make_identifier (loc, std::move (str));
  else
    return Token::make (keyword, loc);
}

// Possibly returns a raw string token if it exists - otherwise returns null.
// Looks ahead past any run of '#' without consuming input; only an opening
// double quote after that run starts a raw string.
TokenPtr
Lexer::maybe_parse_raw_string (location_t loc)
{
  int peek_index = 0;
  while (peek_input (peek_index) == '#')
    peek_index++;

  if (peek_input (peek_index) == '"')
    return parse_raw_string (loc, peek_index);
  else
    return nullptr;
}

// Returns a raw string token.
TokenPtr Lexer::parse_raw_string (location_t loc, int initial_hash_count) { // raw string literals std::string str; str.reserve (16); // some sensible default int length = 1 + initial_hash_count; if (initial_hash_count > 0) skip_input (initial_hash_count - 1); current_char = peek_input (); if (current_char != '"') rust_error_at (get_current_location (), "raw string has no opening %<\"%>"); length++; skip_input (); current_char = peek_input (); while (!current_char.is_eof ()) { if (current_char.value == '"') { bool enough_hashes = true; for (int i = 0; i < initial_hash_count; i++) { if (peek_input (i + 1) != '#') { enough_hashes = false; break; } } if (enough_hashes) { // skip enough input and peek enough input skip_input (initial_hash_count); current_char = peek_input (); length += initial_hash_count + 1; break; } } length++; str += current_char.as_string (); skip_input (); current_char = peek_input (); } current_column += length; loc += length - 1; str.shrink_to_fit (); return Token::make_string (loc, std::move (str)); } template TokenPtr Lexer::parse_non_decimal_int_literal (location_t loc, IsDigitFunc is_digit_func, std::string existent_str, int base) { int length = 1; skip_input (); current_char = peek_input (); length++; // loop through to add entire number to string while (is_digit_func (current_char.value) || current_char == '_') { if (current_char == '_') { // don't add _ to number skip_input (); current_char = peek_input (); length++; continue; } length++; // add raw numbers existent_str += current_char; skip_input (); current_char = peek_input (); } // convert value to decimal representation long dec_num = std::strtol (existent_str.c_str (), nullptr, base); existent_str = std::to_string (dec_num); // parse in type suffix if it exists auto type_suffix_pair = parse_in_type_suffix (); PrimitiveCoreType type_hint = type_suffix_pair.first; length += type_suffix_pair.second; current_column += length; if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64) { 
rust_error_at (get_current_location (), "invalid type suffix %qs for integer (%s) literal", get_type_hint_string (type_hint), base == 16 ? "hex" : (base == 8 ? "octal" : (base == 2 ? "binary" : ""))); return nullptr; } loc += length - 1; return Token::make_int (loc, std::move (existent_str), type_hint); } // Parses a hex, binary or octal int literal. TokenPtr Lexer::parse_non_decimal_int_literals (location_t loc) { std::string str; str.reserve (16); // some sensible default str += current_char; current_char = peek_input (); if (current_char == 'x') { // hex (integer only) return parse_non_decimal_int_literal (loc, is_x_digit, str + "x", 16); } else if (current_char == 'o') { // octal (integer only) return parse_non_decimal_int_literal (loc, is_octal_digit, std::move (str), 8); } else if (current_char == 'b') { // binary (integer only) return parse_non_decimal_int_literal (loc, is_bin_digit, std::move (str), 2); } else { return nullptr; } } // Parses a decimal-based int literal or float literal. TokenPtr Lexer::parse_decimal_int_or_float (location_t loc) { std::string str; str.reserve (16); // some sensible default str += current_char; int length = 1; bool first_zero = current_char == '0'; current_char = peek_input (); // parse initial decimal integer (or first integer part of float) literal auto initial_decimal = parse_in_decimal (); str += std::get<0> (initial_decimal); length += std::get<1> (initial_decimal); // detect float literal // // Note: // // We should not use is_float_digit () for this verification but instead // directly ISDIGIT because rust does not support non digit values right after // a dot. // The following value is not legal in rust: // let a = 3.e1; // A `0` should be put between the dot and the exponent to be valid // (eg. 3.0e1). if (current_char == '.' && ISDIGIT (peek_input (1).value)) { // float with a '.', parse another decimal into it // add . 
to str str += current_char; skip_input (); current_char = peek_input (); length++; // parse another decimal number for float auto second_decimal = parse_in_decimal (); str += std::get<0> (second_decimal); length += std::get<1> (second_decimal); // parse in exponent part if it exists auto exponent_pair = parse_in_exponent_part (); str += exponent_pair.first; length += exponent_pair.second; // parse in type suffix if it exists auto type_suffix_pair = parse_in_type_suffix (); PrimitiveCoreType type_hint = type_suffix_pair.first; length += type_suffix_pair.second; if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64 && type_hint != CORETYPE_UNKNOWN) { rust_error_at (get_current_location (), "invalid type suffix %qs for floating-point literal", get_type_hint_string (type_hint)); // ignore invalid type suffix as everything else seems fine type_hint = CORETYPE_UNKNOWN; } current_column += length; loc += length - 1; str.shrink_to_fit (); return Token::make_float (loc, std::move (str), type_hint); } else if (current_char == '.' && check_valid_float_dot_end (peek_input (1).value)) { // float that is just an integer with a terminating '.' character // add . to str str += current_char; skip_input (); current_char = peek_input (); length++; // type hint not allowed current_column += length; loc += length - 1; str.shrink_to_fit (); return Token::make_float (loc, std::move (str), CORETYPE_UNKNOWN); } else if (current_char == 'E' || current_char == 'e') { // exponent float with no '.' 
character // parse exponent part auto exponent_pair = parse_in_exponent_part (); str += exponent_pair.first; length += exponent_pair.second; // parse in type suffix if it exists auto type_suffix_pair = parse_in_type_suffix (); PrimitiveCoreType type_hint = type_suffix_pair.first; length += type_suffix_pair.second; if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64 && type_hint != CORETYPE_UNKNOWN) { rust_error_at (get_current_location (), "invalid type suffix %qs for floating-point literal", get_type_hint_string (type_hint)); // ignore invalid type suffix as everything else seems fine type_hint = CORETYPE_UNKNOWN; } current_column += length; loc += length - 1; str.shrink_to_fit (); return Token::make_float (loc, std::move (str), type_hint); } else { // is an integer // parse in type suffix if it exists auto type_suffix_pair = parse_in_type_suffix (); PrimitiveCoreType type_hint = type_suffix_pair.first; /* A "real" pure decimal doesn't have a suffix and no zero prefix. */ if (type_hint == CORETYPE_UNKNOWN) { bool pure_decimal = std::get<2> (initial_decimal); if (pure_decimal && (!first_zero || str.size () == 1)) type_hint = CORETYPE_PURE_DECIMAL; } length += type_suffix_pair.second; current_column += length; loc += length - 1; str.shrink_to_fit (); return Token::make_int (loc, std::move (str), type_hint); } } TokenPtr Lexer::parse_char_or_lifetime (location_t loc) { int length = 1; current_char = peek_input (); if (current_char.is_eof ()) return nullptr; // parse escaped char literal if (current_char.value == '\\') { // parse escape auto utf8_escape_pair = parse_utf8_escape (); Codepoint escaped_char = std::get<0> (utf8_escape_pair); length += std::get<1> (utf8_escape_pair); if (peek_input ().value != '\'') { rust_error_at (get_current_location (), "unended character literal"); } else { skip_input (); current_char = peek_input (); length++; } current_column += length; loc += length - 1; return Token::make_char (loc, escaped_char); } else { skip_input (); if 
(peek_input ().value == '\'') { // parse non-escaped char literal Codepoint non_escaped_char = current_char; // skip the ' character skip_input (); current_char = peek_input (); // TODO fix due to different widths of utf-8 chars? current_column += 3; loc += 2; return Token::make_char (loc, non_escaped_char); } else if (is_identifier_start (current_char.value)) { // parse lifetime name std::string str; str += current_char.as_string (); length++; current_char = peek_input (); while (is_identifier_continue (current_char.value)) { str += current_char.as_string (); skip_input (); current_char = peek_input (); length++; } current_column += length; loc += length - 1; // TODO some keywords cannot be used for a lifetime label #2306 // https://doc.rust-lang.org/reference/tokens.html str.shrink_to_fit (); return Token::make_lifetime (loc, std::move (str)); } else { rust_error_at ( get_current_location (), "expected %' after character constant in character literal"); return nullptr; } } } void Lexer::split_current_token (TokenId new_left, TokenId new_right) { /* TODO: assert that this TokenId is a "simple token" like punctuation and not * like "IDENTIFIER"? 
*/ location_t current_loc = peek_token ()->get_locus (); TokenPtr new_left_tok = Token::make (new_left, current_loc); TokenPtr new_right_tok = Token::make (new_right, current_loc + 1); token_queue.replace_current_value (std::move (new_left_tok)); token_queue.insert (1, std::move (new_right_tok)); } void Lexer::split_current_token (std::vector new_tokens) { rust_assert (new_tokens.size () > 0); token_queue.replace_current_value (new_tokens[0]); for (size_t i = 1; i < new_tokens.size (); i++) { token_queue.insert (i, new_tokens[i]); } } void Lexer::start_line (int current_line, int current_column) { if (line_map) linemap_line_start (line_table, current_line, current_column); } } // namespace Rust #if CHECKING_P namespace selftest { // Checks if `src` has the same contents as the given characters static void assert_source_content (Rust::InputSource &src, const std::vector &expected) { Rust::Codepoint src_char = src.next (); for (auto expected_char : expected) { // Make sure that `src` is not shorter than `expected` ASSERT_FALSE (src_char.is_eof ()); // Checks skipped character is expeceted one. ASSERT_EQ (src_char.value, expected_char); src_char = src.next (); } // Checks if `src` and `chars` has the same length. 
ASSERT_TRUE (src_char.is_eof ()); } static void test_buffer_input_source (std::string str, const std::vector &expected) { Rust::BufferInputSource source (str, 0); assert_source_content (source, expected); } static void test_file_input_source (std::string str, const std::vector &expected) { FILE *tmpf = tmpfile (); // Moves to the first character fputs (str.c_str (), tmpf); std::rewind (tmpf); Rust::FileInputSource source (tmpf); assert_source_content (source, expected); } void rust_input_source_test () { // ASCII std::string src = u8"_abcde\tXYZ\v\f"; std::vector expected = {'_', 'a', 'b', 'c', 'd', 'e', '\t', 'X', 'Y', 'Z', '\v', '\f'}; test_buffer_input_source (src, expected); // BOM src = u8"\xef\xbb\xbfOK"; expected = {'O', 'K'}; test_buffer_input_source (src, expected); // Russian src = u8"приве́т"; expected = {L'п', L'р', L'и', L'в', 0x0435 /* CYRILLIC SMALL LETTER IE е */, 0x301 /* COMBINING ACUTE ACCENT ́ */, L'т'}; test_buffer_input_source (src, expected); src = u8"❤️🦀"; expected = {0x2764 /* HEAVY BLACK HEART */, 0xfe0f /* VARIATION SELECTOR-16 */, L'🦀'}; test_buffer_input_source (src, expected); src = u8"こんにちは"; expected = {L'こ', L'ん', L'に', L'ち', L'は'}; test_file_input_source (src, expected); src = u8"👮‍♂👩‍⚕"; expected = {0x1f46e /* POLICE OFFICER */, 0x200d /* ZERO WIDTH JOINER */, 0x2642 /* MALE SIGN */, 0x1f469 /* WOMAN */, 0x200d /* ZERO WIDTH JOINER */, 0x2695 /* STAFF OF AESCULAPIUS */}; test_file_input_source (src, expected); } } // namespace selftest #endif // CHECKING_P