changed
CHANGELOG.md
|
@@ -1,5 +1,9 @@
|
1
1
|
# Changelog
|
2
2
|
|
3
|
+ ## v1.1.0
|
4
|
+
|
5
|
+ * Add PO parser metadata stripping (`:strip_meta` option for `Expo.PO.parse_string/2`)
|
6
|
+
|
3
7
|
## v1.0.1
|
4
8
|
|
5
9
|
* Fix backslash escaping in `Expo.PO.compose/1`
|
changed
hex_metadata.config
|
@@ -4,7 +4,7 @@
|
4
4
|
{<<"GitHub">>,<<"https://github.com/elixir-gettext/expo">>},
|
5
5
|
{<<"Issues">>,<<"https://github.com/elixir-gettext/expo/issues">>}]}.
|
6
6
|
{<<"name">>,<<"expo">>}.
|
7
|
- {<<"version">>,<<"1.0.1">>}.
|
7
|
+ {<<"version">>,<<"1.1.0">>}.
|
8
8
|
{<<"description">>,
|
9
9
|
<<"Low-level Gettext file handling (.po/.pot/.mo file writer and parser).">>}.
|
10
10
|
{<<"elixir">>,<<"~> 1.11">>}.
|
changed
lib/expo/po.ex
|
@@ -6,7 +6,17 @@ defmodule Expo.PO do
|
6
6
|
alias Expo.Messages
|
7
7
|
alias Expo.PO.{DuplicateMessagesError, Parser, SyntaxError}
|
8
8
|
|
9
|
- @type parse_option :: {:file, Path.t()}
|
9
|
+ @typedoc """
|
10
|
+ Parsing option.
|
11
|
+
|
12
|
+ * `:file` (`t:Path.t/0`) - path to use in error messages when using `parse_string/2`. If not present, errors
|
13
|
+ don't have a path.
|
14
|
+
|
15
|
+ * `:strip_meta` (`t:boolean/0`) - include only messages (no comments and other metadata) from the `.po` file
|
16
|
+ to reduce memory usage when meta information is not needed.
|
17
|
+ Defaults to `false`.
|
18
|
+ """
|
19
|
+ @type parse_option :: {:file, Path.t()} | {:strip_meta, boolean()}
|
10
20
|
|
11
21
|
@doc """
|
12
22
|
Dumps a `Expo.Messages` struct as iodata.
|
changed
lib/expo/po/parser.ex
|
@@ -12,7 +12,7 @@ defmodule Expo.PO.Parser do
|
12
12
|
def parse(content, opts) do
|
13
13
|
content = prune_bom(content, Keyword.get(opts, :file, "nofile"))
|
14
14
|
|
15
|
- with {:ok, tokens} <- tokenize(content),
|
15
|
+ with {:ok, tokens} <- tokenize(content, opts),
|
16
16
|
{:ok, po} <- parse_tokens(tokens),
|
17
17
|
{:ok, po} <- check_for_duplicates(po) do
|
18
18
|
{:ok, %Messages{po | file: Keyword.get(opts, :file)}}
|
|
@@ -22,8 +22,8 @@ defmodule Expo.PO.Parser do
|
22
22
|
end
|
23
23
|
end
|
24
24
|
|
25
|
- defp tokenize(content) do
|
26
|
- case Tokenizer.tokenize(content) do
|
25
|
+ defp tokenize(content, opts) do
|
26
|
+ case Tokenizer.tokenize(content, opts) do
|
27
27
|
{:ok, tokens} -> {:ok, tokens}
|
28
28
|
{:error, line, message} -> {:error, %SyntaxError{line: line, reason: message}}
|
29
29
|
end
|
changed
lib/expo/po/tokenizer.ex
|
@@ -52,9 +52,11 @@ defmodule Expo.PO.Tokenizer do
|
52
52
|
* `{:str, 6, "foo"}`
|
53
53
|
|
54
54
|
"""
|
55
|
- @spec tokenize(binary) :: {:ok, [token]} | {:error, pos_integer, binary}
|
56
|
- def tokenize(str) do
|
57
|
- tokenize_line(str, _line = 1, _tokens_acc = [])
|
55
|
+ @spec tokenize(binary, [Expo.PO.parse_option()]) ::
|
56
|
+ {:ok, [token]} | {:error, pos_integer, binary}
|
57
|
+ def tokenize(str, opts \\ []) do
|
58
|
+ strip_meta? = Keyword.get(opts, :strip_meta, false)
|
59
|
+ tokenize_line(str, _line = 1, strip_meta?, _tokens_acc = [])
|
58
60
|
end
|
59
61
|
|
60
62
|
# Reverse str_lines strings.
|
|
@@ -86,79 +88,85 @@ defmodule Expo.PO.Tokenizer do
|
86
88
|
end
|
87
89
|
|
88
90
|
# End of file.
|
89
|
- defp tokenize_line(<<>>, line, acc) do
|
91
|
+ defp tokenize_line(<<>>, line, _strip_meta?, acc) do
|
90
92
|
{:ok, [{:"$end", line} | acc] |> Enum.reverse() |> postprocess_tokens()}
|
91
93
|
end
|
92
94
|
|
93
95
|
# Go to the next line.
|
94
|
- defp tokenize_line(<<?\n, rest::binary>>, line, acc) do
|
95
|
- tokenize_line(rest, line + 1, acc)
|
96
|
+ defp tokenize_line(<<?\n, rest::binary>>, line, strip_meta?, acc) do
|
97
|
+ tokenize_line(rest, line + 1, strip_meta?, acc)
|
96
98
|
end
|
97
99
|
|
98
100
|
# Skip other whitespace.
|
99
|
- defp tokenize_line(<<char, rest::binary>>, line, acc)
|
101
|
+ defp tokenize_line(<<char, rest::binary>>, line, strip_meta?, acc)
|
100
102
|
when char in @whitespace_no_nl do
|
101
|
- tokenize_line(rest, line, acc)
|
103
|
+ tokenize_line(rest, line, strip_meta?, acc)
|
104
|
+ end
|
105
|
+
|
106
|
+ # Skip comment/meta lines entirely when the :strip_meta option is enabled.
|
107
|
+ defp tokenize_line(<<?#, rest::binary>>, line, true, acc) do
|
108
|
+ from_next_line = discard_until_nl(rest)
|
109
|
+ tokenize_line(from_next_line, line, true, acc)
|
102
110
|
end
|
103
111
|
|
104
112
|
# Obsolete comment.
|
105
|
- defp tokenize_line(<<"#~", rest::binary>>, line, acc) do
|
106
|
- tokenize_line(rest, line, [{:obsolete, line} | acc])
|
113
|
+ defp tokenize_line(<<"#~", rest::binary>>, line, strip_meta?, acc) do
|
114
|
+ tokenize_line(rest, line, strip_meta?, [{:obsolete, line} | acc])
|
107
115
|
end
|
108
116
|
|
109
117
|
# Previous comment.
|
110
|
- defp tokenize_line(<<"#|", rest::binary>>, line, acc) do
|
111
|
- tokenize_line(rest, line, [{:previous, line} | acc])
|
118
|
+ defp tokenize_line(<<"#|", rest::binary>>, line, strip_meta?, acc) do
|
119
|
+ tokenize_line(rest, line, strip_meta?, [{:previous, line} | acc])
|
112
120
|
end
|
113
121
|
|
114
122
|
# Normal comment.
|
115
|
- defp tokenize_line(<<?#, _rest::binary>> = rest, line, acc) do
|
123
|
+ defp tokenize_line(<<?#, _rest::binary>> = rest, line, strip_meta?, acc) do
|
116
124
|
{contents, rest} = to_eol_or_eof(rest, "")
|
117
|
- tokenize_line(rest, line, [{:comment, line, contents} | acc])
|
125
|
+ tokenize_line(rest, line, strip_meta?, [{:comment, line, contents} | acc])
|
118
126
|
end
|
119
127
|
|
120
128
|
# Keywords.
|
121
129
|
for kw <- @string_keywords do
|
122
|
- defp tokenize_line(unquote(kw) <> <<char, rest::binary>>, line, acc)
|
130
|
+ defp tokenize_line(unquote(kw) <> <<char, rest::binary>>, line, strip_meta?, acc)
|
123
131
|
when char in @whitespace do
|
124
132
|
acc = [{unquote(String.to_existing_atom(kw)), line} | acc]
|
125
|
- tokenize_line(rest, line, acc)
|
133
|
+ tokenize_line(rest, line, strip_meta?, acc)
|
126
134
|
end
|
127
135
|
|
128
|
- defp tokenize_line(unquote(kw) <> _rest, line, _acc) do
|
136
|
+ defp tokenize_line(unquote(kw) <> _rest, line, _strip_meta?, _acc) do
|
129
137
|
{:error, line, "no space after '#{unquote(kw)}'"}
|
130
138
|
end
|
131
139
|
end
|
132
140
|
|
133
141
|
# `msgstr`.
|
134
|
- defp tokenize_line("msgstr[" <> <<rest::binary>>, line, acc) do
|
142
|
+ defp tokenize_line("msgstr[" <> <<rest::binary>>, line, strip_meta?, acc) do
|
135
143
|
case tokenize_plural_form(rest, "") do
|
136
144
|
{:ok, plural_form, rest} ->
|
137
145
|
# The order of the :plural_form and :msgstr tokens is inverted since
|
138
146
|
# the `acc` array of tokens will be reversed at the end.
|
139
147
|
acc = [{:plural_form, line, plural_form}, {:msgstr, line} | acc]
|
140
|
- tokenize_line(rest, line, acc)
|
148
|
+ tokenize_line(rest, line, strip_meta?, acc)
|
141
149
|
|
142
150
|
{:error, reason} ->
|
143
151
|
{:error, line, reason}
|
144
152
|
end
|
145
153
|
end
|
146
154
|
|
147
|
- defp tokenize_line("msgstr" <> <<char, rest::binary>>, line, acc)
|
155
|
+ defp tokenize_line("msgstr" <> <<char, rest::binary>>, line, strip_meta?, acc)
|
148
156
|
when char in @whitespace do
|
149
157
|
acc = [{:msgstr, line} | acc]
|
150
|
- tokenize_line(rest, line, acc)
|
158
|
+ tokenize_line(rest, line, strip_meta?, acc)
|
151
159
|
end
|
152
160
|
|
153
|
- defp tokenize_line("msgstr" <> _rest, line, _acc) do
|
161
|
+ defp tokenize_line("msgstr" <> _rest, line, _strip_meta?, _acc) do
|
154
162
|
{:error, line, "no space after 'msgstr'"}
|
155
163
|
end
|
156
164
|
|
157
165
|
# String.
|
158
|
- defp tokenize_line(<<?", rest::binary>>, line, acc) do
|
166
|
+ defp tokenize_line(<<?", rest::binary>>, line, strip_meta?, acc) do
|
159
167
|
case tokenize_string(rest, "") do
|
160
168
|
{:ok, string, rest} ->
|
161
|
- tokenize_line(rest, line, add_str_lines(line, string, acc))
|
169
|
+ tokenize_line(rest, line, strip_meta?, add_str_lines(line, string, acc))
|
162
170
|
|
163
171
|
{:error, reason} ->
|
164
172
|
{:error, line, reason}
|
|
@@ -170,7 +178,7 @@ defmodule Expo.PO.Tokenizer do
|
170
178
|
# a letter (we don't take care of unicode or fancy stuff, just ASCII letters),
|
171
179
|
# we assume there's an unknown keyword. We parse it with a regex
|
172
180
|
# so that the error message is informative.
|
173
|
- defp tokenize_line(<<letter, _rest::binary>> = binary, line, _acc)
|
181
|
+ defp tokenize_line(<<letter, _rest::binary>> = binary, line, _strip_meta?, _acc)
|
174
182
|
when letter in ?a..?z or letter in ?A..?Z do
|
175
183
|
next_word = List.first(Regex.run(~r/\w+/u, binary))
|
176
184
|
{:error, line, "unknown keyword '#{next_word}'"}
|
|
@@ -180,13 +188,18 @@ defmodule Expo.PO.Tokenizer do
|
180
188
|
# Last resort: this is just a plain unexpected token. We take the first
|
181
189
|
# Unicode char of the given binary and build an informative error message
|
182
190
|
# (with the codepoint of the char).
|
183
|
- defp tokenize_line(binary, line, _acc) when is_binary(binary) do
|
191
|
+ defp tokenize_line(binary, line, _strip_meta?, _acc) when is_binary(binary) do
|
184
192
|
# To get the first Unicode char, we convert to char list first.
|
185
193
|
[char | _] = String.to_charlist(binary)
|
186
194
|
msg = :io_lib.format(~c"unexpected token: \"~ts\" (codepoint U+~4.16.0B)", [[char], char])
|
187
195
|
{:error, line, :unicode.characters_to_binary(msg)}
|
188
196
|
end
|
189
197
|
|
198
|
+ defp discard_until_nl(content)
|
199
|
+ defp discard_until_nl(<<?\n, _rest::binary>> = content), do: content
|
200
|
+ defp discard_until_nl(<<>>), do: <<>>
|
201
|
+ defp discard_until_nl(<<_char, rest::binary>>), do: discard_until_nl(rest)
|
202
|
+
|
190
203
|
@obsolete_keywords ~w(msgid msgid_plural msgctxt msgstr)a
|
191
204
|
|
192
205
|
# Collapse the string into the previous str_lines token if there is one *on the same line*.
|
changed
mix.exs
|
@@ -2,7 +2,7 @@
|
2
2
|
defmodule Expo.MixProject do
|
3
3
|
use Mix.Project
|
4
4
|
|
5
|
- @version "1.0.1"
|
5
|
+ @version "1.1.0"
|
6
6
|
@source_url "https://github.com/elixir-gettext/expo"
|
7
7
|
@description "Low-level Gettext file handling (.po/.pot/.mo file writer and parser)."
|