diff --git a/lib/elixir/src/elixir_parser.yrl b/lib/elixir/src/elixir_parser.yrl index 4f3bc99a5a..70fa6bd7e7 100644 --- a/lib/elixir/src/elixir_parser.yrl +++ b/lib/elixir/src/elixir_parser.yrl @@ -1016,7 +1016,13 @@ build_bin_string({bin_string, Location, Args}, ExtraMeta) -> {'<<>>', Meta, string_parts(Args)}. build_list_string({list_string, _Location, [H]} = Token, ExtraMeta) when is_binary(H) -> - handle_literal(elixir_utils:characters_to_list(H), Token, ExtraMeta); + try + List = elixir_utils:characters_to_list(H), + handle_literal(List, Token, ExtraMeta) + catch + error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} -> + return_error(?location(Token), elixir_utils:characters_to_list(Message), "'") + end; build_list_string({list_string, Location, Args}, ExtraMeta) -> Meta = meta_from_location(Location), MetaWithExtra = diff --git a/lib/elixir/src/elixir_tokenizer.erl b/lib/elixir/src/elixir_tokenizer.erl index 578bc340aa..59f6383b57 100644 --- a/lib/elixir/src/elixir_tokenizer.erl +++ b/lib/elixir/src/elixir_tokenizer.erl @@ -1023,15 +1023,33 @@ is_unnecessary_quote(_Parts, _Scope) -> unsafe_to_atom(Part, Line, Column, #elixir_tokenizer{}) when is_binary(Part) andalso byte_size(Part) > 255; is_list(Part) andalso length(Part) > 255 -> - {error, {?LOC(Line, Column), "atom length must be less than system limit: ", elixir_utils:characters_to_list(Part)}}; + try + PartList = elixir_utils:characters_to_list(Part), + {error, {?LOC(Line, Column), "atom length must be less than system limit: ", PartList}} + catch + error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} -> + {error, {?LOC(Line, Column), "invalid encoding in atom: ", elixir_utils:characters_to_list(Message)}} + end; unsafe_to_atom(Part, Line, Column, #elixir_tokenizer{static_atoms_encoder=StaticAtomsEncoder}) when is_function(StaticAtomsEncoder) -> - Value = elixir_utils:characters_to_binary(Part), - case StaticAtomsEncoder(Value, [{line, Line}, {column, Column}]) of - {ok, Term} -> - {ok, Term}; - {error, Reason} when is_binary(Reason) -> - {error, {?LOC(Line, Column), elixir_utils:characters_to_list(Reason) ++ ": ", elixir_utils:characters_to_list(Part)}} + EncodeResult = try + ValueEncBin = elixir_utils:characters_to_binary(Part), + ValueEncList = elixir_utils:characters_to_list(Part), + {ok, ValueEncBin, ValueEncList} + catch + error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} -> + {error, {?LOC(Line, Column), "invalid encoding in atom: ", elixir_utils:characters_to_list(Message)}} + end, + + case EncodeResult of + {ok, Value, ValueList} -> + case StaticAtomsEncoder(Value, [{line, Line}, {column, Column}]) of + {ok, Term} -> + {ok, Term}; + {error, Reason} when is_binary(Reason) -> + {error, {?LOC(Line, Column), elixir_utils:characters_to_list(Reason) ++ ": ", ValueList}} + end; + EncError -> EncError end; unsafe_to_atom(Binary, Line, Column, #elixir_tokenizer{existing_atoms_only=true}) when is_binary(Binary) -> try @@ -1039,9 +1057,14 @@ unsafe_to_atom(Binary, Line, Column, #elixir_tokenizer{existing_atoms_only=true} catch error:badarg -> % Check if it's a UTF-8 issue by trying to convert to list - elixir_utils:characters_to_list(Binary), - % If we get here, it's not a UTF-8 issue - {error, {?LOC(Line, Column), "unsafe atom does not exist: ", elixir_utils:characters_to_list(Binary)}} + try + List = elixir_utils:characters_to_list(Binary), + % If we get here, it's not a UTF-8 issue + {error, {?LOC(Line, Column), "unsafe atom does not exist: ", List}} + catch + error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} -> + {error, {?LOC(Line, Column), "invalid encoding in atom: ", elixir_utils:characters_to_list(Message)}} + end end; unsafe_to_atom(Binary, Line, Column, #elixir_tokenizer{}) when is_binary(Binary) -> try @@ -1049,9 +1072,14 @@ unsafe_to_atom(Binary, Line, Column, #elixir_tokenizer{}) when is_binary(Binary) catch error:badarg -> % Try to convert using elixir_utils to get proper UnicodeConversionError - elixir_utils:characters_to_list(Binary), - % If we get here, it's not a UTF-8 issue, so it's some other badarg - {error, {?LOC(Line, Column), "invalid atom: ", elixir_utils:characters_to_list(Binary)}} + try + List = elixir_utils:characters_to_list(Binary), + % If we get here, it's not a UTF-8 issue, so it's some other badarg + {error, {?LOC(Line, Column), "invalid atom: ", List}} + catch + error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} -> + {error, {?LOC(Line, Column), "invalid encoding in atom: ", elixir_utils:characters_to_list(Message)}} + end end; unsafe_to_atom(List, Line, Column, #elixir_tokenizer{existing_atoms_only=true}) when is_list(List) -> try @@ -1059,9 +1087,14 @@ unsafe_to_atom(List, Line, Column, #elixir_tokenizer{existing_atoms_only=true}) catch error:badarg -> % Try to convert using elixir_utils to get proper UnicodeConversionError - elixir_utils:characters_to_binary(List), - % If we get here, it's not a UTF-8 issue - {error, {?LOC(Line, Column), "unsafe atom does not exist: ", List}} + try + elixir_utils:characters_to_binary(List), + % If we get here, it's not a UTF-8 issue + {error, {?LOC(Line, Column), "unsafe atom does not exist: ", List}} + catch + error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} -> + {error, {?LOC(Line, Column), "invalid encoding in atom: ", elixir_utils:characters_to_list(Message)}} + end end; unsafe_to_atom(List, Line, Column, #elixir_tokenizer{}) when is_list(List) -> try @@ -1069,9 +1102,14 @@ unsafe_to_atom(List, Line, Column, #elixir_tokenizer{}) when is_list(List) -> catch error:badarg -> % Try to convert using elixir_utils to get proper UnicodeConversionError - elixir_utils:characters_to_binary(List), - % If we get here, it's not a UTF-8 issue, so it's some other badarg - {error, {?LOC(Line, Column), "invalid atom: ", List}} + try + elixir_utils:characters_to_binary(List), + % If we get here, it's not a UTF-8 issue, so it's some other badarg + {error, {?LOC(Line, Column), "invalid atom: ", List}} + catch + error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} -> + {error, {?LOC(Line, Column), "invalid encoding in atom: ", elixir_utils:characters_to_list(Message)}} + end end. collect_modifiers([H | T], Buffer) when ?is_downcase(H) or ?is_upcase(H) or ?is_digit(H) -> @@ -1095,7 +1133,12 @@ extract_heredoc_with_interpolation(Line, Column, Scope, Interpol, T, H) -> {Parts1, {ShouldWarn, _}} = lists:mapfoldl(Fun, {false, Line}, Parts0), Parts2 = extract_heredoc_head(Parts1), NewScope = maybe_heredoc_warn(ShouldWarn, Column, InterScope, H), - {ok, NewLine, NewColumn, tokens_to_binary(Parts2), Rest, NewScope}; + try + {ok, NewLine, NewColumn, tokens_to_binary(Parts2), Rest, NewScope} + catch + error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} -> + {error, interpolation_format(Message, " (for heredoc starting at line ~B)", [Line], Line, Column, [H, H, H], [H, H, H])} + end; {error, Reason} -> {error, interpolation_format(Reason, " (for heredoc starting at line ~B)", [Line], Line, Column, [H, H, H], [H, H, H])} @@ -1166,8 +1209,13 @@ unescape_tokens(Tokens, Line, Column, #elixir_tokenizer{unescape=true}) -> {error, Message, Token} -> {error, {?LOC(Line, Column), Message ++ ". Syntax error after: ", Token}} end; -unescape_tokens(Tokens, _Line, _Column, #elixir_tokenizer{unescape=false}) -> - {ok, tokens_to_binary(Tokens)}. +unescape_tokens(Tokens, Line, Column, #elixir_tokenizer{unescape=false}) -> + try + {ok, tokens_to_binary(Tokens)} + catch + error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} -> + {error, {?LOC(Line, Column), "invalid encoding in tokens: ", elixir_utils:characters_to_list(Message)}} + end. tokens_to_binary(Tokens) -> [if is_list(Token) -> elixir_utils:characters_to_binary(Token); true -> Token end @@ -1671,7 +1719,14 @@ tokenize_sigil_contents([H | T] = Original, [S | _] = SigilName, Line, Column, S case elixir_interpolation:extract(Line, Column + 1, Scope, ?is_downcase(S), T, sigil_terminator(H)) of {NewLine, NewColumn, Parts, Rest, NewScope} -> Indentation = nil, - add_sigil_token(SigilName, Line, Column, NewLine, NewColumn, tokens_to_binary(Parts), Rest, NewScope, Tokens, Indentation, <>); + try + add_sigil_token(SigilName, Line, Column, NewLine, NewColumn, tokens_to_binary(Parts), Rest, NewScope, Tokens, Indentation, <>) + catch + error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} -> + Sigil = [$~, S, H], + Message = " (for sigil ~ts starting at line ~B)", + interpolation_error(Message, [$~] ++ SigilName ++ Original, Scope, Tokens, Message, [Sigil, Line], Line, Column, [H], [sigil_terminator(H)]) + end; {error, Reason} -> Sigil = [$~, S, H], diff --git a/lib/elixir/test/elixir/code_test.exs b/lib/elixir/test/elixir/code_test.exs index 5be948fa38..23bc49f0ac 100644 --- a/lib/elixir/test/elixir/code_test.exs +++ b/lib/elixir/test/elixir/code_test.exs @@ -556,24 +556,47 @@ defmodule CodeTest do assert token4 == "\\u" end - test "string_to_quoted raises UnicodeConversionError for invalid UTF-8 in quoted atoms and function calls" do + test "string_to_quoted returns error for invalid UTF-8 in strings" do invalid_utf8_cases = [ + # charlist + "'\\xFF'", + # charlist heredoc + "'''\n\\xFF\\\n'''" + ] + + for code <- invalid_utf8_cases do + assert {:error, {_, message, _}} = Code.string_to_quoted(code) + assert message =~ "invalid encoding starting at <<255>>" + end + end + + test "string_to_quoted returns error for invalid UTF-8 in quoted atoms and function calls" do + invalid_utf8_cases = [ + # charlist + # ~S{'\xFF'}, + # charlist heredoc + # ~s{'''\n\xFF\n'''}, # Quoted atom ~S{:"\xFF"}, ~S{:'\xFF'}, + # Quoted keyword identifier + ~S{["\xFF": 1]}, + ~S{['\xFF': 1]}, # Quoted function call ~S{foo."\xFF"()}, ~S{foo.'\xFF'()} ] for code <- invalid_utf8_cases do - assert_raise UnicodeConversionError, fn -> - Code.string_to_quoted!(code) - end + assert {:error, {_, message, detail}} = Code.string_to_quoted(code) + assert message =~ "invalid encoding in atom: " + assert detail =~ "invalid encoding starting at <<255>>" - assert_raise UnicodeConversionError, fn -> - Code.string_to_quoted!(code, existing_atoms_only: true) - end + assert {:error, {_, message, detail}} = + Code.string_to_quoted(code, existing_atoms_only: true) + + assert message =~ "invalid encoding in atom: " + assert detail =~ "invalid encoding starting at <<255>>" end end