Skip to content

Commit 8585797

Browse files
compiler: support new numeric literal syntax
Support 0b, 0o, and hex floats.

Tested against test/literal2.go in the gc repo.

Updates golang/go#12711
Updates golang/go#19308
Updates golang/go#28493
Updates golang/go#29008

Change-Id: I2ab01255f529880a2bd8603107e3e1ae03a7a3e5
Reviewed-on: https://go-review.googlesource.com/c/gofrontend/+/189718
Reviewed-by: Than McIntosh <[email protected]>
Reviewed-by: Cherry Zhang <[email protected]>
1 parent 4b47cad commit 8585797

File tree

2 files changed

+223
-64
lines changed

2 files changed

+223
-64
lines changed

go/lex.cc

Lines changed: 216 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -986,6 +986,26 @@ Lex::is_hex_digit(char c)
986986
|| (c >= 'a' && c <= 'f'));
987987
}
988988

989+
// Return whether C is a valid digit in BASE.
990+
991+
bool
992+
Lex::is_base_digit(int base, char c)
993+
{
994+
switch (base)
995+
{
996+
case 2:
997+
return c == '0' || c == '1';
998+
case 8:
999+
return c >= '0' && c <= '7';
1000+
case 10:
1001+
return c >= '0' && c <= '9';
1002+
case 16:
1003+
return Lex::is_hex_digit(c);
1004+
default:
1005+
go_unreachable();
1006+
}
1007+
}
1008+
9891009
// not a hex value
9901010
#define NHV 100
9911011

@@ -1032,13 +1052,24 @@ Lex::hex_val(char c)
10321052
return hex_value_lookup_table[static_cast<unsigned char>(c)];
10331053
}
10341054

1035-
// Return whether an exponent could start at P.
1055+
// Return whether an exponent could start at P, in base BASE.
10361056

10371057
bool
1038-
Lex::could_be_exponent(const char* p, const char* pend)
1058+
Lex::could_be_exponent(int base, const char* p, const char* pend)
10391059
{
1040-
if (*p != 'e' && *p != 'E')
1041-
return false;
1060+
switch (base)
1061+
{
1062+
case 10:
1063+
if (*p != 'e' && *p != 'E')
1064+
return false;
1065+
break;
1066+
case 16:
1067+
if (*p != 'p' && *p != 'P')
1068+
return false;
1069+
break;
1070+
default:
1071+
go_unreachable();
1072+
}
10421073
++p;
10431074
if (p >= pend)
10441075
return false;
@@ -1062,87 +1093,160 @@ Lex::gather_number()
10621093

10631094
Location location = this->location();
10641095

1065-
bool neg = false;
1066-
if (*p == '+')
1067-
++p;
1068-
else if (*p == '-')
1069-
{
1070-
++p;
1071-
neg = true;
1072-
}
1073-
1074-
const char* pnum = p;
1096+
int base = 10;
1097+
std::string num;
10751098
if (*p == '0')
10761099
{
1077-
int base;
1078-
if ((p[1] == 'x' || p[1] == 'X')
1079-
&& Lex::is_hex_digit(p[2]))
1100+
int basecheck;
1101+
int off;
1102+
if (p[1] == 'x' || p[1] == 'X')
10801103
{
10811104
base = 16;
1082-
p += 2;
1083-
pnum = p;
1084-
while (p < pend)
1085-
{
1086-
if (!Lex::is_hex_digit(*p))
1087-
break;
1088-
++p;
1089-
}
1105+
basecheck = 16;
1106+
off = 2;
1107+
}
1108+
else if (p[1] == 'o' || p[1] == 'O')
1109+
{
1110+
base = 8;
1111+
basecheck = 8;
1112+
off = 2;
1113+
}
1114+
else if (p[1] == 'b' || p[1] == 'B')
1115+
{
1116+
base = 2;
1117+
basecheck = 2;
1118+
off = 2;
10901119
}
10911120
else
10921121
{
1122+
// Old style octal literal. May also be the start of a
1123+
// floating-point number (e.g., 09.2, 09e2) or an imaginary
1124+
// literal (e.g., 09i), so we have to accept decimal digits.
10931125
base = 8;
1094-
pnum = p;
1095-
while (p < pend)
1096-
{
1097-
if (*p < '0' || *p > '9')
1098-
break;
1099-
++p;
1100-
}
1126+
basecheck = 10;
1127+
off = 0;
1128+
}
1129+
1130+
p += off;
1131+
if (*p == '_' && Lex::is_base_digit(basecheck, p[1]))
1132+
++p;
1133+
1134+
while (Lex::is_base_digit(basecheck, *p))
1135+
{
1136+
num.push_back(*p);
1137+
++p;
1138+
if (*p == '_' && Lex::is_base_digit(basecheck, p[1]))
1139+
++p;
1140+
}
1141+
1142+
// We must see at least one valid digit, except for a case like
1143+
// 0x.0p1.
1144+
if (num.length() == 0 && (base != 16 || *p != '.'))
1145+
{
1146+
go_error_at(this->location(), "invalid numeric literal");
1147+
this->lineoff_ = p - this->linebuf_;
1148+
mpz_t val;
1149+
mpz_init_set_ui(val, 0);
1150+
Token ret = Token::make_integer_token(val, location);
1151+
mpz_clear(val);
1152+
return ret;
1153+
}
1154+
1155+
bool is_float = false;
1156+
// A number that looks like an old-style octal literal might
1157+
// actually be the beginning of a floating-point or imaginary
1158+
// literal, in which case the value is decimal digits. Handle
1159+
// that case below by treating the leading '0' as decimal.
1160+
if (off == 0
1161+
&& (*p == '.' || *p == 'i' || Lex::could_be_exponent(10, p, pend)))
1162+
{
1163+
is_float = true;
1164+
base = 10;
11011165
}
1166+
else if (base == 16
1167+
&& (*p == '.' || Lex::could_be_exponent(16, p, pend)))
1168+
is_float = true;
11021169

1103-
// A partial token that looks like an octal literal might actually be the
1104-
// beginning of a floating-point or imaginary literal.
1105-
if (base == 16 || (*p != '.' && *p != 'i' && !Lex::could_be_exponent(p, pend)))
1170+
if (!is_float)
11061171
{
1107-
std::string s(pnum, p - pnum);
11081172
mpz_t val;
1109-
int r = mpz_init_set_str(val, s.c_str(), base);
1173+
int r = mpz_init_set_str(val, num.c_str(), base);
11101174
if (r != 0)
11111175
{
1112-
if (base == 8)
1113-
go_error_at(this->location(), "invalid octal literal");
1114-
else
1115-
go_error_at(this->location(), "invalid hex literal");
1176+
const char *errword;
1177+
switch (base)
1178+
{
1179+
case 2:
1180+
errword = "binary";
1181+
break;
1182+
case 8:
1183+
errword = "octal";
1184+
break;
1185+
case 16:
1186+
errword = "hex";
1187+
break;
1188+
default:
1189+
go_unreachable();
1190+
}
1191+
go_error_at(this->location(), "invalid %s literal", errword);
11161192
}
11171193

1118-
if (neg)
1119-
mpz_neg(val, val);
1194+
bool is_imaginary = *p == 'i';
1195+
if (is_imaginary)
1196+
++p;
11201197

11211198
this->lineoff_ = p - this->linebuf_;
1122-
Token ret = Token::make_integer_token(val, location);
1123-
mpz_clear(val);
1124-
return ret;
1199+
1200+
if (*p == 'e' || *p == 'E' || *p == 'p' || *p == 'P')
1201+
{
1202+
go_error_at(location,
1203+
"invalid prefix for floating constant");
1204+
this->skip_exponent();
1205+
}
1206+
1207+
if (!is_imaginary)
1208+
{
1209+
Token ret = Token::make_integer_token(val, location);
1210+
mpz_clear(val);
1211+
return ret;
1212+
}
1213+
else
1214+
{
1215+
mpfr_t ival;
1216+
mpfr_init_set_z(ival, val, GMP_RNDN);
1217+
mpz_clear(val);
1218+
Token ret = Token::make_imaginary_token(ival, location);
1219+
mpfr_clear(ival);
1220+
return ret;
1221+
}
11251222
}
11261223
}
11271224

11281225
while (p < pend)
11291226
{
1130-
if (*p < '0' || *p > '9')
1227+
if (*p == '_' && p[1] >= '0' && p[1] <= '9')
1228+
++p;
1229+
else if (*p < '0' || *p > '9')
11311230
break;
1231+
num.push_back(*p);
11321232
++p;
11331233
}
11341234

1135-
if (*p != '.' && *p != 'i' && !Lex::could_be_exponent(p, pend))
1235+
if (*p != '.' && *p != 'i' && !Lex::could_be_exponent(base, p, pend))
11361236
{
1137-
std::string s(pnum, p - pnum);
11381237
mpz_t val;
1139-
int r = mpz_init_set_str(val, s.c_str(), 10);
1238+
int r = mpz_init_set_str(val, num.c_str(), 10);
11401239
go_assert(r == 0);
11411240

1142-
if (neg)
1143-
mpz_neg(val, val);
1144-
11451241
this->lineoff_ = p - this->linebuf_;
1242+
1243+
if (*p == 'e' || *p == 'E' || *p == 'p' || *p == 'P')
1244+
{
1245+
go_error_at(location,
1246+
"invalid prefix for floating constant");
1247+
this->skip_exponent();
1248+
}
1249+
11461250
Token ret = Token::make_integer_token(val, location);
11471251
mpz_clear(val);
11481252
return ret;
@@ -1152,48 +1256,76 @@ Lex::gather_number()
11521256
{
11531257
bool dot = *p == '.';
11541258

1259+
num.push_back(*p);
11551260
++p;
11561261

11571262
if (!dot)
11581263
{
11591264
if (*p == '+' || *p == '-')
1160-
++p;
1265+
{
1266+
num.push_back(*p);
1267+
++p;
1268+
}
11611269
}
11621270

1271+
bool first = true;
11631272
while (p < pend)
11641273
{
1165-
if (*p < '0' || *p > '9')
1274+
if (!first && *p == '_' && Lex::is_base_digit(base, p[1]))
1275+
++p;
1276+
else if (!Lex::is_base_digit(base, *p))
11661277
break;
1278+
num.push_back(*p);
11671279
++p;
1280+
first = false;
11681281
}
11691282

1170-
if (dot && Lex::could_be_exponent(p, pend))
1283+
if (dot && Lex::could_be_exponent(base, p, pend))
11711284
{
1285+
num.push_back(*p);
11721286
++p;
11731287
if (*p == '+' || *p == '-')
1174-
++p;
1288+
{
1289+
num.push_back(*p);
1290+
++p;
1291+
}
1292+
first = true;
11751293
while (p < pend)
11761294
{
1177-
if (*p < '0' || *p > '9')
1295+
if (!first && *p == '_' && p[1] >= '0' && p[1] <= '9')
1296+
++p;
1297+
else if (*p < '0' || *p > '9')
11781298
break;
1299+
num.push_back(*p);
11791300
++p;
1301+
first = false;
11801302
}
11811303
}
1304+
else if (dot && base == 16)
1305+
{
1306+
go_error_at(this->location(),
1307+
"invalid hex floating-point literal with no exponent");
1308+
num.append("p0");
1309+
}
11821310
}
11831311

1184-
std::string s(pnum, p - pnum);
11851312
mpfr_t val;
1186-
int r = mpfr_init_set_str(val, s.c_str(), 10, GMP_RNDN);
1313+
int r = mpfr_init_set_str(val, num.c_str(), base, GMP_RNDN);
11871314
go_assert(r == 0);
11881315

1189-
if (neg)
1190-
mpfr_neg(val, val, GMP_RNDN);
1191-
11921316
bool is_imaginary = *p == 'i';
11931317
if (is_imaginary)
11941318
++p;
11951319

11961320
this->lineoff_ = p - this->linebuf_;
1321+
1322+
if (*p == 'e' || *p == 'E' || *p == 'p' || *p == 'P')
1323+
{
1324+
go_error_at(location,
1325+
"invalid prefix for floating constant");
1326+
this->skip_exponent();
1327+
}
1328+
11971329
if (is_imaginary)
11981330
{
11991331
Token ret = Token::make_imaginary_token(val, location);
@@ -1208,6 +1340,27 @@ Lex::gather_number()
12081340
}
12091341
}
12101342

1343+
// Skip an exponent after reporting an error.
1344+
1345+
void
1346+
Lex::skip_exponent()
1347+
{
1348+
const char* p = this->linebuf_ + this->lineoff_;
1349+
const char* pend = this->linebuf_ + this->linesize_;
1350+
if (*p != 'e' && *p != 'E' && *p != 'p' && *p != 'P')
1351+
return;
1352+
++p;
1353+
if (*p == '+' || *p == '-')
1354+
++p;
1355+
while (p < pend)
1356+
{
1357+
if ((*p < '0' || *p > '9') && *p != '_')
1358+
break;
1359+
++p;
1360+
}
1361+
this->lineoff_ = p - this->linebuf_;
1362+
}
1363+
12111364
// Advance one character, possibly escaped. Return the pointer beyond
12121365
// the character. Set *VALUE to the character. Set *IS_CHARACTER if
12131366
// this is a character (e.g., 'a' or '\u1234') rather than a byte

0 commit comments

Comments
 (0)