changed CHANGELOG.md
 
@@ -1,5 +1,9 @@
1
1
# Changelog
2
2
3
+ ## v1.1.0
4
+
5
+ * Add PO Parser Metadata Stripping
6
+
3
7
## v1.0.1
4
8
5
9
* Fix backslash escaping in `Expo.PO.compose/1`
changed hex_metadata.config
 
@@ -4,7 +4,7 @@
4
4
{<<"GitHub">>,<<"https://github.com/elixir-gettext/expo">>},
5
5
{<<"Issues">>,<<"https://github.com/elixir-gettext/expo/issues">>}]}.
6
6
{<<"name">>,<<"expo">>}.
7
- {<<"version">>,<<"1.0.1">>}.
7
+ {<<"version">>,<<"1.1.0">>}.
8
8
{<<"description">>,
9
9
<<"Low-level Gettext file handling (.po/.pot/.mo file writer and parser).">>}.
10
10
{<<"elixir">>,<<"~> 1.11">>}.
changed lib/expo/po.ex
 
@@ -6,7 +6,17 @@ defmodule Expo.PO do
6
6
alias Expo.Messages
7
7
alias Expo.PO.{DuplicateMessagesError, Parser, SyntaxError}
8
8
9
- @type parse_option :: {:file, Path.t()}
9
+ @typedoc """
10
+ Parsing option.
11
+
12
+ * `:file` (`t:Path.t/0`) - path to use in error messages when using `parse_string/2`. If not present, errors
13
+ don't have a path.
14
+
15
+ * `:strip_meta` (`t:boolean/0`) - include only messages (no comments or other metadata) from the `.po` file
16
+ to reduce memory usage when meta information is not needed.
17
+ Defaults to `false`.
18
+ """
19
+ @type parse_option :: {:file, Path.t()} | {:strip_meta, boolean()}
10
20
11
21
@doc """
12
22
Dumps a `Expo.Messages` struct as iodata.
changed lib/expo/po/parser.ex
 
@@ -12,7 +12,7 @@ defmodule Expo.PO.Parser do
12
12
def parse(content, opts) do
13
13
content = prune_bom(content, Keyword.get(opts, :file, "nofile"))
14
14
15
- with {:ok, tokens} <- tokenize(content),
15
+ with {:ok, tokens} <- tokenize(content, opts),
16
16
{:ok, po} <- parse_tokens(tokens),
17
17
{:ok, po} <- check_for_duplicates(po) do
18
18
{:ok, %Messages{po | file: Keyword.get(opts, :file)}}
 
@@ -22,8 +22,8 @@ defmodule Expo.PO.Parser do
22
22
end
23
23
end
24
24
25
- defp tokenize(content) do
26
- case Tokenizer.tokenize(content) do
25
+ defp tokenize(content, opts) do
26
+ case Tokenizer.tokenize(content, opts) do
27
27
{:ok, tokens} -> {:ok, tokens}
28
28
{:error, line, message} -> {:error, %SyntaxError{line: line, reason: message}}
29
29
end
changed lib/expo/po/tokenizer.ex
 
@@ -52,9 +52,11 @@ defmodule Expo.PO.Tokenizer do
52
52
* `{:str, 6, "foo"}`
53
53
54
54
"""
55
- @spec tokenize(binary) :: {:ok, [token]} | {:error, pos_integer, binary}
56
- def tokenize(str) do
57
- tokenize_line(str, _line = 1, _tokens_acc = [])
55
+ @spec tokenize(binary, [Expo.PO.parse_option()]) ::
56
+ {:ok, [token]} | {:error, pos_integer, binary}
57
+ def tokenize(str, opts \\ []) do
58
+ strip_meta? = Keyword.get(opts, :strip_meta, false)
59
+ tokenize_line(str, _line = 1, strip_meta?, _tokens_acc = [])
58
60
end
59
61
60
62
# Reverse str_lines strings.
 
@@ -86,79 +88,85 @@ defmodule Expo.PO.Tokenizer do
86
88
end
87
89
88
90
# End of file.
89
- defp tokenize_line(<<>>, line, acc) do
91
+ defp tokenize_line(<<>>, line, _strip_meta?, acc) do
90
92
{:ok, [{:"$end", line} | acc] |> Enum.reverse() |> postprocess_tokens()}
91
93
end
92
94
93
95
# Go to the next line.
94
- defp tokenize_line(<<?\n, rest::binary>>, line, acc) do
95
- tokenize_line(rest, line + 1, acc)
96
+ defp tokenize_line(<<?\n, rest::binary>>, line, strip_meta?, acc) do
97
+ tokenize_line(rest, line + 1, strip_meta?, acc)
96
98
end
97
99
98
100
# Skip other whitespace.
99
- defp tokenize_line(<<char, rest::binary>>, line, acc)
101
+ defp tokenize_line(<<char, rest::binary>>, line, strip_meta?, acc)
100
102
when char in @whitespace_no_nl do
101
- tokenize_line(rest, line, acc)
103
+ tokenize_line(rest, line, strip_meta?, acc)
104
+ end
105
+
106
+ # Skip meta information (comment lines) when strip_meta is enabled.
107
+ defp tokenize_line(<<?#, rest::binary>>, line, true, acc) do
108
+ from_next_line = discard_until_nl(rest)
109
+ tokenize_line(from_next_line, line, true, acc)
102
110
end
103
111
104
112
# Obsolete comment.
105
- defp tokenize_line(<<"#~", rest::binary>>, line, acc) do
106
- tokenize_line(rest, line, [{:obsolete, line} | acc])
113
+ defp tokenize_line(<<"#~", rest::binary>>, line, strip_meta?, acc) do
114
+ tokenize_line(rest, line, strip_meta?, [{:obsolete, line} | acc])
107
115
end
108
116
109
117
# Previous comment.
110
- defp tokenize_line(<<"#|", rest::binary>>, line, acc) do
111
- tokenize_line(rest, line, [{:previous, line} | acc])
118
+ defp tokenize_line(<<"#|", rest::binary>>, line, strip_meta?, acc) do
119
+ tokenize_line(rest, line, strip_meta?, [{:previous, line} | acc])
112
120
end
113
121
114
122
# Normal comment.
115
- defp tokenize_line(<<?#, _rest::binary>> = rest, line, acc) do
123
+ defp tokenize_line(<<?#, _rest::binary>> = rest, line, strip_meta?, acc) do
116
124
{contents, rest} = to_eol_or_eof(rest, "")
117
- tokenize_line(rest, line, [{:comment, line, contents} | acc])
125
+ tokenize_line(rest, line, strip_meta?, [{:comment, line, contents} | acc])
118
126
end
119
127
120
128
# Keywords.
121
129
for kw <- @string_keywords do
122
- defp tokenize_line(unquote(kw) <> <<char, rest::binary>>, line, acc)
130
+ defp tokenize_line(unquote(kw) <> <<char, rest::binary>>, line, strip_meta?, acc)
123
131
when char in @whitespace do
124
132
acc = [{unquote(String.to_existing_atom(kw)), line} | acc]
125
- tokenize_line(rest, line, acc)
133
+ tokenize_line(rest, line, strip_meta?, acc)
126
134
end
127
135
128
- defp tokenize_line(unquote(kw) <> _rest, line, _acc) do
136
+ defp tokenize_line(unquote(kw) <> _rest, line, _strip_meta?, _acc) do
129
137
{:error, line, "no space after '#{unquote(kw)}'"}
130
138
end
131
139
end
132
140
133
141
# `msgstr`.
134
- defp tokenize_line("msgstr[" <> <<rest::binary>>, line, acc) do
142
+ defp tokenize_line("msgstr[" <> <<rest::binary>>, line, strip_meta?, acc) do
135
143
case tokenize_plural_form(rest, "") do
136
144
{:ok, plural_form, rest} ->
137
145
# The order of the :plural_form and :msgstr tokens is inverted since
138
146
# the `acc` array of tokens will be reversed at the end.
139
147
acc = [{:plural_form, line, plural_form}, {:msgstr, line} | acc]
140
- tokenize_line(rest, line, acc)
148
+ tokenize_line(rest, line, strip_meta?, acc)
141
149
142
150
{:error, reason} ->
143
151
{:error, line, reason}
144
152
end
145
153
end
146
154
147
- defp tokenize_line("msgstr" <> <<char, rest::binary>>, line, acc)
155
+ defp tokenize_line("msgstr" <> <<char, rest::binary>>, line, strip_meta?, acc)
148
156
when char in @whitespace do
149
157
acc = [{:msgstr, line} | acc]
150
- tokenize_line(rest, line, acc)
158
+ tokenize_line(rest, line, strip_meta?, acc)
151
159
end
152
160
153
- defp tokenize_line("msgstr" <> _rest, line, _acc) do
161
+ defp tokenize_line("msgstr" <> _rest, line, _strip_meta?, _acc) do
154
162
{:error, line, "no space after 'msgstr'"}
155
163
end
156
164
157
165
# String.
158
- defp tokenize_line(<<?", rest::binary>>, line, acc) do
166
+ defp tokenize_line(<<?", rest::binary>>, line, strip_meta?, acc) do
159
167
case tokenize_string(rest, "") do
160
168
{:ok, string, rest} ->
161
- tokenize_line(rest, line, add_str_lines(line, string, acc))
169
+ tokenize_line(rest, line, strip_meta?, add_str_lines(line, string, acc))
162
170
163
171
{:error, reason} ->
164
172
{:error, line, reason}
 
@@ -170,7 +178,7 @@ defmodule Expo.PO.Tokenizer do
170
178
# a letter (we don't take care of unicode or fancy stuff, just ASCII letters),
171
179
# we assume there's an unknown keyword. We parse it with a regex
172
180
# so that the error message is informative.
173
- defp tokenize_line(<<letter, _rest::binary>> = binary, line, _acc)
181
+ defp tokenize_line(<<letter, _rest::binary>> = binary, line, _strip_meta?, _acc)
174
182
when letter in ?a..?z or letter in ?A..?Z do
175
183
next_word = List.first(Regex.run(~r/\w+/u, binary))
176
184
{:error, line, "unknown keyword '#{next_word}'"}
 
@@ -180,13 +188,18 @@ defmodule Expo.PO.Tokenizer do
180
188
# Last resort: this is just a plain unexpected token. We take the first
181
189
# Unicode char of the given binary and build an informative error message
182
190
# (with the codepoint of the char).
183
- defp tokenize_line(binary, line, _acc) when is_binary(binary) do
191
+ defp tokenize_line(binary, line, _strip_meta?, _acc) when is_binary(binary) do
184
192
# To get the first Unicode char, we convert to char list first.
185
193
[char | _] = String.to_charlist(binary)
186
194
msg = :io_lib.format(~c"unexpected token: \"~ts\" (codepoint U+~4.16.0B)", [[char], char])
187
195
{:error, line, :unicode.characters_to_binary(msg)}
188
196
end
189
197
198
+ defp discard_until_nl(content)
199
+ defp discard_until_nl(<<?\n, _rest::binary>> = content), do: content
200
+ defp discard_until_nl(<<>>), do: <<>>
201
+ defp discard_until_nl(<<_char, rest::binary>>), do: discard_until_nl(rest)
202
+
190
203
@obsolete_keywords ~w(msgid msgid_plural msgctxt msgstr)a
191
204
192
205
# Collapse the string into the previous str_lines token if there is one *on the same line*.
changed mix.exs
 
@@ -2,7 +2,7 @@
2
2
defmodule Expo.MixProject do
3
3
use Mix.Project
4
4
5
- @version "1.0.1"
5
+ @version "1.1.0"
6
6
@source_url "https://github.com/elixir-gettext/expo"
7
7
@description "Low-level Gettext file handling (.po/.pot/.mo file writer and parser)."