changed CHANGELOG.md
 
@@ -1,5 +1,9 @@
1
1
# Changelog
2
2
3
+ ## v1.1.0
4
+
5
+ * Add PO Parser Metadata Stripping
6
+
3
7
## v1.0.1
4
8
5
9
* Fix backslash escaping in `Expo.PO.compose/1`
changed hex_metadata.config
 
@@ -4,7 +4,7 @@
4
4
{<<"GitHub">>,<<"https://github.com/elixir-gettext/expo">>},
5
5
{<<"Issues">>,<<"https://github.com/elixir-gettext/expo/issues">>}]}.
6
6
{<<"name">>,<<"expo">>}.
7
- {<<"version">>,<<"1.0.1">>}.
7
+ {<<"version">>,<<"1.1.0">>}.
8
8
{<<"description">>,
9
9
<<"Low-level Gettext file handling (.po/.pot/.mo file writer and parser).">>}.
10
10
{<<"elixir">>,<<"~> 1.11">>}.
changed lib/expo/po.ex
 
@@ -6,7 +6,17 @@ defmodule Expo.PO do
6
6
alias Expo.Messages
7
7
alias Expo.PO.{DuplicateMessagesError, Parser, SyntaxError}
8
8
9
- @type parse_option :: {:file, Path.t()}
9
+ @typedoc """
10
+ Parsing option.
11
+
12
+ * `:file` (`t:Path.t/0`) - path to use in error messages when using `parse_string/2`. If not present, errors
13
+ don't have a path.
14
+
15
+ * `:strip_meta` (`t:boolean/0`) - include only messages (no comments or other metadata) from the `.po` file
16
+ to reduce memory usage when meta information is not needed.
17
+ Defaults to `false`.
18
+ """
19
+ @type parse_option :: {:file, Path.t()} | {:strip_meta, boolean()}
10
20
11
21
@doc """
12
22
Dumps a `Expo.Messages` struct as iodata.
changed lib/expo/po/parser.ex
 
@@ -12,7 +12,7 @@ defmodule Expo.PO.Parser do
12
12
def parse(content, opts) do
13
13
content = prune_bom(content, Keyword.get(opts, :file, "nofile"))
14
14
15
- with {:ok, tokens} <- tokenize(content),
15
+ with {:ok, tokens} <- tokenize(content, opts),
16
16
{:ok, po} <- parse_tokens(tokens),
17
17
{:ok, po} <- check_for_duplicates(po) do
18
18
{:ok, %Messages{po | file: Keyword.get(opts, :file)}}
 
@@ -22,8 +22,8 @@ defmodule Expo.PO.Parser do
22
22
end
23
23
end
24
24
25
- defp tokenize(content) do
26
- case Tokenizer.tokenize(content) do
25
+ defp tokenize(content, opts) do
26
+ case Tokenizer.tokenize(content, opts) do
27
27
{:ok, tokens} -> {:ok, tokens}
28
28
{:error, line, message} -> {:error, %SyntaxError{line: line, reason: message}}
29
29
end
changed lib/expo/po/tokenizer.ex
 
@@ -52,9 +52,11 @@ defmodule Expo.PO.Tokenizer do
52
52
* `{:str, 6, "foo"}`
53
53
54
54
"""
55
- @spec tokenize(binary) :: {:ok, [token]} | {:error, pos_integer, binary}
56
- def tokenize(str) do
57
- tokenize_line(str, _line = 1, _tokens_acc = [])
55
+ @spec tokenize(binary, [Expo.PO.parse_option()]) ::
56
+ {:ok, [token]} | {:error, pos_integer, binary}
57
+ def tokenize(str, opts \\ []) do
58
+ strip_meta? = Keyword.get(opts, :strip_meta, false)
59
+ tokenize_line(str, _line = 1, strip_meta?, _tokens_acc = [])
58
60
end
59
61
60
62
# Reverse str_lines strings.
 
@@ -86,79 +88,85 @@ defmodule Expo.PO.Tokenizer do
86
88
end
87
89
88
90
# End of file.
89
- defp tokenize_line(<<>>, line, acc) do
91
+ defp tokenize_line(<<>>, line, _strip_meta?, acc) do
90
92
{:ok, [{:"$end", line} | acc] |> Enum.reverse() |> postprocess_tokens()}
91
93
end
92
94
93
95
# Go to the next line.
94
- defp tokenize_line(<<?\n, rest::binary>>, line, acc) do
95
- tokenize_line(rest, line + 1, acc)
96
+ defp tokenize_line(<<?\n, rest::binary>>, line, strip_meta?, acc) do
97
+ tokenize_line(rest, line + 1, strip_meta?, acc)
96
98
end
97
99
98
100
# Skip other whitespace.
99
- defp tokenize_line(<<char, rest::binary>>, line, acc)
101
+ defp tokenize_line(<<char, rest::binary>>, line, strip_meta?, acc)
100
102
when char in @whitespace_no_nl do
101
- tokenize_line(rest, line, acc)
103
+ tokenize_line(rest, line, strip_meta?, acc)
104
+ end
105
+
106
+ # Skip meta information (comment lines) when strip_meta is enabled.
107
+ defp tokenize_line(<<?#, rest::binary>>, line, true, acc) do
108
+ from_next_line = discard_until_nl(rest)
109
+ tokenize_line(from_next_line, line, true, acc)
102
110
end
103
111
104
112
# Obsolete comment.
105
- defp tokenize_line(<<"#~", rest::binary>>, line, acc) do
106
- tokenize_line(rest, line, [{:obsolete, line} | acc])
113
+ defp tokenize_line(<<"#~", rest::binary>>, line, strip_meta?, acc) do
114
+ tokenize_line(rest, line, strip_meta?, [{:obsolete, line} | acc])
107
115
end
108
116
109
117
# Previous comment.
110
- defp tokenize_line(<<"#|", rest::binary>>, line, acc) do
111
- tokenize_line(rest, line, [{:previous, line} | acc])
118
+ defp tokenize_line(<<"#|", rest::binary>>, line, strip_meta?, acc) do
119
+ tokenize_line(rest, line, strip_meta?, [{:previous, line} | acc])
112
120
end
113
121
114
122
# Normal comment.
115
- defp tokenize_line(<<?#, _rest::binary>> = rest, line, acc) do
123
+ defp tokenize_line(<<?#, _rest::binary>> = rest, line, strip_meta?, acc) do
116
124
{contents, rest} = to_eol_or_eof(rest, "")
117
- tokenize_line(rest, line, [{:comment, line, contents} | acc])
125
+ tokenize_line(rest, line, strip_meta?, [{:comment, line, contents} | acc])
118
126
end
119
127
120
128
# Keywords.
121
129
for kw <- @string_keywords do
122
- defp tokenize_line(unquote(kw) <> <<char, rest::binary>>, line, acc)
130
+ defp tokenize_line(unquote(kw) <> <<char, rest::binary>>, line, strip_meta?, acc)
123
131
when char in @whitespace do
124
132
acc = [{unquote(String.to_existing_atom(kw)), line} | acc]
125
- tokenize_line(rest, line, acc)
133
+ tokenize_line(rest, line, strip_meta?, acc)
126
134
end
127
135
128
- defp tokenize_line(unquote(kw) <> _rest, line, _acc) do
136
+ defp tokenize_line(unquote(kw) <> _rest, line, _strip_meta?, _acc) do
129
137
{:error, line, "no space after '#{unquote(kw)}'"}
130
138
end
131
139
end
132
140
133
141
# `msgstr`.
134
- defp tokenize_line("msgstr[" <> <<rest::binary>>, line, acc) do
142
+ defp tokenize_line("msgstr[" <> <<rest::binary>>, line, strip_meta?, acc) do
135
143
case tokenize_plural_form(rest, "") do
136
144
{:ok, plural_form, rest} ->
137
145
# The order of the :plural_form and :msgstr tokens is inverted since
138
146
# the `acc` array of tokens will be reversed at the end.
139
147
acc = [{:plural_form, line, plural_form}, {:msgstr, line} | acc]
140
- tokenize_line(rest, line, acc)
148
+ tokenize_line(rest, line, strip_meta?, acc)
141
149
142
150
{:error, reason} ->
143
151
{:error, line, reason}
144
152
end
145
153
end
146
154
147
- defp tokenize_line("msgstr" <> <<char, rest::binary>>, line, acc)
155
+ defp tokenize_line("msgstr" <> <<char, rest::binary>>, line, strip_meta?, acc)
148
156
when char in @whitespace do
149
157
acc = [{:msgstr, line} | acc]
150
- tokenize_line(rest, line, acc)
158
+ tokenize_line(rest, line, strip_meta?, acc)
151
159
end
152
160
153
- defp tokenize_line("msgstr" <> _rest, line, _acc) do
161
+ defp tokenize_line("msgstr" <> _rest, line, _strip_meta?, _acc) do
154
162
{:error, line, "no space after 'msgstr'"}
155
163
end
156
164
157
165
# String.
158
- defp tokenize_line(<<?", rest::binary>>, line, acc) do
166
+ defp tokenize_line(<<?", rest::binary>>, line, strip_meta?, acc) do
159
167
case tokenize_string(rest, "") do
160
168
{:ok, string, rest} ->
161
- tokenize_line(rest, line, add_str_lines(line, string, acc))
169
+ tokenize_line(rest, line, strip_meta?, add_str_lines(line, string, acc))
162
170
163
171
{:error, reason} ->
164
172
{:error, line, reason}
 
@@ -170,7 +178,7 @@ defmodule Expo.PO.Tokenizer do
170
178
# a letter (we don't take care of unicode or fancy stuff, just ASCII letters),
171
179
# we assume there's an unknown keyword. We parse it with a regex
172
180
# so that the error message is informative.
173
- defp tokenize_line(<<letter, _rest::binary>> = binary, line, _acc)
181
+ defp tokenize_line(<<letter, _rest::binary>> = binary, line, _strip_meta?, _acc)
174
182
when letter in ?a..?z or letter in ?A..?Z do
175
183
next_word = List.first(Regex.run(~r/\w+/u, binary))
176
184
{:error, line, "unknown keyword '#{next_word}'"}
 
@@ -180,13 +188,18 @@ defmodule Expo.PO.Tokenizer do
180
188
# Last resort: this is just a plain unexpected token. We take the first
181
189
# Unicode char of the given binary and build an informative error message
182
190
# (with the codepoint of the char).
183
- defp tokenize_line(binary, line, _acc) when is_binary(binary) do
191
+ defp tokenize_line(binary, line, _strip_meta?, _acc) when is_binary(binary) do
184
192
# To get the first Unicode char, we convert to char list first.
185
193
[char | _] = String.to_charlist(binary)
186
194
msg = :io_lib.format(~c"unexpected token: \"~ts\" (codepoint U+~4.16.0B)", [[char], char])
187
195
{:error, line, :unicode.characters_to_binary(msg)}
188
196
end
189
197
198
+ defp discard_until_nl(content)
199
+ defp discard_until_nl(<<?\n, _rest::binary>> = content), do: content
200
+ defp discard_until_nl(<<>>), do: <<>>
201
+ defp discard_until_nl(<<_char, rest::binary>>), do: discard_until_nl(rest)
202
+
190
203
@obsolete_keywords ~w(msgid msgid_plural msgctxt msgstr)a
191
204
192
205
# Collapse the string into the previous str_lines token if there is one *on the same line*.
changed mix.exs
 
@@ -2,7 +2,7 @@
2
2
defmodule Expo.MixProject do
3
3
use Mix.Project
4
4
5
- @version "1.0.1"
5
+ @version "1.1.0"
6
6
@source_url "https://github.com/elixir-gettext/expo"
7
7
@description "Low-level Gettext file handling (.po/.pot/.mo file writer and parser)."