From 125252a38900c15d62a79b2ed4f20b8950eaaff4 Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Wed, 15 Jul 2015 01:03:41 -0300 Subject: [PATCH 1/3] Fix typo --- lib/floki/deep_text.ex | 4 ++-- lib/floki/flat_text.ex | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/floki/deep_text.ex b/lib/floki/deep_text.ex index e858dc1c..1da0739f 100644 --- a/lib/floki/deep_text.ex +++ b/lib/floki/deep_text.ex @@ -14,8 +14,8 @@ defmodule Floki.DeepText do ## Examples - iex> Floki.DeepText.get([{"a", [], ["The mean of life is...", {"strong", [], ["something else"]}] }]) - "The mean of life is...something else" + iex> Floki.DeepText.get([{"a", [], ["The meaning of life is...", {"strong", [], ["something else"]}] }]) + "The meaning of life is...something else" """ def get(html_tree) do diff --git a/lib/floki/flat_text.ex b/lib/floki/flat_text.ex index 0041d458..58370f89 100644 --- a/lib/floki/flat_text.ex +++ b/lib/floki/flat_text.ex @@ -14,8 +14,8 @@ defmodule Floki.FlatText do ## Examples - iex> Floki.FlatText.get([{"a", [], ["The mean of life is...", {"strong", [], ["something else"]}] }]) - "The mean of life is..." + iex> Floki.FlatText.get([{"a", [], ["The meaning of life is...", {"strong", [], ["something else"]}] }]) + "The meaning of life is..." """ def get(html_nodes) when is_list(html_nodes) do From 4cde3f4628277053b29286c4c0be70b9d062b480 Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Sun, 23 Aug 2015 01:11:34 -0300 Subject: [PATCH 2/3] Fix `Floki.find/2` when there is a non-HTML input This commit closes the issue #17. It also includes a refactor to organize the code. It only moves related things to its modules. 
--- CHANGELOG.md | 3 + lib/floki.ex | 211 +++++------------------------------------- lib/floki/finder.ex | 102 ++++++++++++++++++++ lib/floki/matchers.ex | 78 ++++++++++++++++ lib/floki/parser.ex | 11 +++ mix.lock | 6 +- test/floki_test.exs | 8 ++ 7 files changed, 226 insertions(+), 193 deletions(-) create mode 100644 lib/floki/finder.ex create mode 100644 lib/floki/matchers.ex create mode 100644 lib/floki/parser.ex diff --git a/CHANGELOG.md b/CHANGELOG.md index 01b73579..5a486839 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ This project adheres to [Semantic Versioning](http://semver.org/). ## [Unreleased][unreleased] +- Fix `Floki.find/2` when there is a non-HTML input. +It closes the [issue #17](https://github.com/philss/floki/issues/17) + ## [0.3.2] - 2015-06-27 ### Fixed diff --git a/lib/floki.ex b/lib/floki.ex index ef7db12e..0aeea34b 100644 --- a/lib/floki.ex +++ b/lib/floki.ex @@ -1,4 +1,7 @@ defmodule Floki do + alias Floki.Finder + alias Floki.Parser + @moduledoc """ A HTML parser and seeker. @@ -10,16 +13,18 @@ defmodule Floki do Assuming that you have the following HTML: - - - -
-

Floki

- Github page - philss -
- - + ```html + + + +
+

Floki

+ Github page + philss +
+ + + ``` You can perform the following queries: @@ -66,15 +71,10 @@ defmodule Floki do """ - @floki_root_node "floki" - @spec parse(binary) :: html_tree def parse(html) do - html = "<#{@floki_root_node}>#{html}" - {@floki_root_node, [], parsed} = :mochiweb_html.parse(html) - - if length(parsed) == 1, do: hd(parsed), else: parsed + Parser.parse(html) end @doc """ @@ -101,66 +101,8 @@ defmodule Floki do @spec find(binary | html_tree, binary) :: html_tree - def find(html, selector) when is_binary(html) do - html_tree = parse(html) - - find(html_tree, selector) - end - - def find(html_tree, selector) when is_tuple(selector) do - {:ok, nodes} = find_by_selector(selector, html_tree, &attr_matcher/3, {:ok, []}) - - Enum.reverse(nodes) - end - - def find(html_tree, selector) do - tag_attr_val_regex = ~r/(?'tag'.+)\[(?'attr'.+)=(?'val'.+)\]/ - attr_val_regex = ~r/\[(?'attr'.+)=(?'val'.+)\]/ - - cond do - String.contains?(selector, ",") -> - selectors = String.split(selector, ",") - - Enum.reduce selectors, [], fn(selector, acc) -> - selector = String.strip(selector) - - nodes = find(html_tree, selector) - - unless is_list(nodes), do: nodes = [nodes] - - Enum.concat(acc, nodes) - end - String.contains?(selector, "\s") -> - descendent_selector = String.split(selector) - - Enum.reduce descendent_selector, html_tree, fn(selector, tree) -> - find(tree, selector) - end - String.starts_with?(selector, ".") -> - "." 
<> class = selector - {:ok, nodes} = find_by_selector(class, html_tree, &class_matcher/3, {:ok, []}) - - Enum.reverse(nodes) - String.starts_with?(selector, "#") -> - "#" <> id = selector - {_status, nodes} = find_by_selector(id, html_tree, &id_matcher/3, {:ok, []}) - - List.first(nodes) - Regex.match?(attr_val_regex, selector) -> - %{"attr" => attr, "val" => val} = Regex.named_captures(attr_val_regex, selector) - {:ok, nodes} = find_by_selector({attr, val}, html_tree, &attr_matcher/3, {:ok, []}) - - Enum.reverse(nodes) - Regex.match?(tag_attr_val_regex, selector) -> - %{"tag" => tag, "attr" => attr, "val" => val} = Regex.named_captures(attr_val_regex, selector) - {:ok, nodes} = find_by_selector({tag, attr, val}, html_tree, &attr_matcher/3, {:ok, []}) - - Enum.reverse(nodes) - true -> - {:ok, nodes} = find_by_selector(selector, html_tree, &tag_matcher/3, {:ok, []}) - - Enum.reverse(nodes) - end + def find(html, selector) do + Finder.find(html, selector) end @doc """ @@ -178,7 +120,7 @@ defmodule Floki do def attribute(html, selector, attribute_name) do html |> find(selector) - |> attribute_values(attribute_name) + |> Finder.attribute_values(attribute_name) end @doc """ @@ -196,11 +138,10 @@ defmodule Floki do def attribute(html_tree, attribute_name) when is_binary(html_tree) do html_tree |> parse - |> attribute(attribute_name) + |> Finder.attribute_values(attribute_name) end def attribute(elements, attribute_name) do - elements - |> attribute_values(attribute_name) + Finder.attribute_values(elements, attribute_name) end @doc """ @@ -235,114 +176,4 @@ defmodule Floki do search_strategy.get(html_tree) end - - defp attribute_match?(attributes, attribute_name) do - Enum.find attributes, fn({attr_name, _}) -> - attr_name == attribute_name - end - end - - defp attribute_match?(attributes, attribute_name, selector_value) do - Enum.find attributes, fn(attribute) -> - {attr_name, attr_value} = attribute - - attr_name == attribute_name && value_match?(attr_value, 
selector_value) - end - end - - defp find_by_selector(_selector, {}, _, acc), do: acc - defp find_by_selector(_selector, [], _, acc), do: acc - defp find_by_selector(_selector, _, _, {:done, nodes}), do: {:done, nodes} - defp find_by_selector(_selector, tree, _, acc) when is_binary(tree), do: acc - defp find_by_selector(selector, [h|t], matcher, acc) do - acc = find_by_selector(selector, h, matcher, acc) - find_by_selector(selector, t, matcher, acc) - end - # Ignore comments - defp find_by_selector(_selector, {:comment, _comment}, _, acc), do: acc - # Ignore XML document version - defp find_by_selector(_selector, {:pi, _xml, _xml_attrs}, _, acc), do: acc - defp find_by_selector(selector, node, matcher, acc) do - {_, _, child_node} = node - - acc = matcher.(selector, node, acc) - - find_by_selector(selector, child_node, matcher, acc) - end - - defp attribute_values(element, attr_name) when is_tuple(element) do - attribute_values([element], attr_name) - end - defp attribute_values(elements, attr_name) do - values = Enum.reduce elements, [], fn({_, attributes, _}, acc) -> - case attribute_match?(attributes, attr_name) do - {_attr_name, value} -> - [value|acc] - _ -> - acc - end - end - - Enum.reverse(values) - end - - defp attr_matcher({attr, value}, node, acc) do - {_, attributes, _} = node - {:ok, acc_nodes} = acc - - if attribute_match?(attributes, attr, value) do - acc = {:ok, [node|acc_nodes]} - end - - acc - end - defp attr_matcher({tag_name, attr, value}, node, acc) do - {tag, attributes, _} = node - {:ok, acc_nodes} = acc - - if tag == tag_name and attribute_match?(attributes, attr, value) do - acc = {:ok, [node|acc_nodes]} - end - - acc - end - - defp class_matcher(class_name, node, acc) do - {_, attributes, _} = node - {:ok, acc_nodes} = acc - - if attribute_match?(attributes, "class", class_name) do - acc = {:ok, [node|acc_nodes]} - end - - acc - end - - defp tag_matcher(tag_name, node, acc) do - {tag, _, _} = node - {:ok, acc_nodes} = acc - - if tag == 
tag_name do - acc = {:ok, [node|acc_nodes]} - end - - acc - end - - defp id_matcher(id, node, acc) do - {_, attributes, _} = node - {:ok, acc_nodes} = acc - - if attribute_match?(attributes, "id", id) do - acc = {:done, [node|acc_nodes]} - end - - acc - end - - defp value_match?(attribute_value, selector_value) do - attribute_value - |> String.split - |> Enum.any?(fn(x) -> x == selector_value end) - end end diff --git a/lib/floki/finder.ex b/lib/floki/finder.ex new file mode 100644 index 00000000..20d4390b --- /dev/null +++ b/lib/floki/finder.ex @@ -0,0 +1,102 @@ +defmodule Floki.Finder do + @moduledoc false + + import Floki.Matchers + + def find(html, selector) when is_binary(html) do + Floki.Parser.parse(html) |> do_find(selector) + end + + def find(html_tree, selector), do: do_find(html_tree, selector) + + def attribute_values(element, attr_name) when is_tuple(element) do + attribute_values([element], attr_name) + end + def attribute_values(elements, attr_name) do + values = Enum.reduce elements, [], fn({_, attributes, _}, acc) -> + case attribute_match?(attributes, attr_name) do + {_attr_name, value} -> + [value|acc] + _ -> + acc + end + end + + Enum.reverse(values) + end + + defp do_find(html_tree, selector) when is_tuple(selector) do + {:ok, nodes} = find_by_selector(selector, html_tree, &attr_matcher/3, {:ok, []}) + Enum.reverse(nodes) + end + + defp do_find(html_tree, selector) do + tag_attr_val_regex = ~r/(?'tag'.+)\[(?'attr'.+)=(?'val'.+)\]/ + attr_val_regex = ~r/\[(?'attr'.+)=(?'val'.+)\]/ + + cond do + String.contains?(selector, ",") -> + selectors = String.split(selector, ",") + + Enum.reduce selectors, [], fn(selector, acc) -> + selector = String.strip(selector) + + nodes = do_find(html_tree, selector) + + unless is_list(nodes), do: nodes = [nodes] + + Enum.concat(acc, nodes) + end + String.contains?(selector, "\s") -> + descendent_selector = String.split(selector) + + Enum.reduce descendent_selector, html_tree, fn(selector, tree) -> + do_find(tree, 
selector) + end + String.starts_with?(selector, ".") -> + "." <> class = selector + {:ok, nodes} = find_by_selector(class, html_tree, &class_matcher/3, {:ok, []}) + + Enum.reverse(nodes) + String.starts_with?(selector, "#") -> + "#" <> id = selector + {_status, nodes} = find_by_selector(id, html_tree, &id_matcher/3, {:ok, []}) + + List.first(nodes) + Regex.match?(attr_val_regex, selector) -> + %{"attr" => attr, "val" => val} = Regex.named_captures(attr_val_regex, selector) + {:ok, nodes} = find_by_selector({attr, val}, html_tree, &attr_matcher/3, {:ok, []}) + + Enum.reverse(nodes) + Regex.match?(tag_attr_val_regex, selector) -> + %{"tag" => tag, "attr" => attr, "val" => val} = Regex.named_captures(attr_val_regex, selector) + {:ok, nodes} = find_by_selector({tag, attr, val}, html_tree, &attr_matcher/3, {:ok, []}) + + Enum.reverse(nodes) + true -> + {:ok, nodes} = find_by_selector(selector, html_tree, &tag_matcher/3, {:ok, []}) + + Enum.reverse(nodes) + end + end + + defp find_by_selector(_selector, {}, _, acc), do: acc + defp find_by_selector(_selector, [], _, acc), do: acc + defp find_by_selector(_selector, _, _, {:done, nodes}), do: {:done, nodes} + defp find_by_selector(_selector, tree, _, acc) when is_binary(tree), do: acc + defp find_by_selector(selector, [h|t], matcher, acc) do + acc = find_by_selector(selector, h, matcher, acc) + find_by_selector(selector, t, matcher, acc) + end + # Ignore comments + defp find_by_selector(_selector, {:comment, _comment}, _, acc), do: acc + # Ignore XML document version + defp find_by_selector(_selector, {:pi, _xml, _xml_attrs}, _, acc), do: acc + defp find_by_selector(selector, node, matcher, acc) do + {_, _, child_node} = node + + acc = matcher.(selector, node, acc) + + find_by_selector(selector, child_node, matcher, acc) + end +end diff --git a/lib/floki/matchers.ex b/lib/floki/matchers.ex new file mode 100644 index 00000000..42920896 --- /dev/null +++ b/lib/floki/matchers.ex @@ -0,0 +1,78 @@ +defmodule Floki.Matchers do + 
@moduledoc false + + def attr_matcher({attr, value}, node, acc) do + {_, attributes, _} = node + {:ok, acc_nodes} = acc + + if attribute_match?(attributes, attr, value) do + acc = {:ok, [node|acc_nodes]} + end + + acc + end + + def attr_matcher({tag_name, attr, value}, node, acc) do + {tag, attributes, _} = node + {:ok, acc_nodes} = acc + + if tag == tag_name and attribute_match?(attributes, attr, value) do + acc = {:ok, [node|acc_nodes]} + end + + acc + end + + def class_matcher(class_name, node, acc) do + {_, attributes, _} = node + {:ok, acc_nodes} = acc + + if attribute_match?(attributes, "class", class_name) do + acc = {:ok, [node|acc_nodes]} + end + + acc + end + + def tag_matcher(tag_name, node, acc) do + {tag, _, _} = node + {:ok, acc_nodes} = acc + + if tag == tag_name do + acc = {:ok, [node|acc_nodes]} + end + + acc + end + + def id_matcher(id, node, acc) do + {_, attributes, _} = node + {:ok, acc_nodes} = acc + + if attribute_match?(attributes, "id", id) do + acc = {:done, [node|acc_nodes]} + end + + acc + end + + def value_match?(attribute_value, selector_value) do + attribute_value + |> String.split + |> Enum.any?(fn(x) -> x == selector_value end) + end + + def attribute_match?(attributes, attribute_name) do + Enum.find attributes, fn({attr_name, _}) -> + attr_name == attribute_name + end + end + + def attribute_match?(attributes, attribute_name, selector_value) do + Enum.find attributes, fn(attribute) -> + {attr_name, attr_value} = attribute + + attr_name == attribute_name && value_match?(attr_value, selector_value) + end + end +end diff --git a/lib/floki/parser.ex b/lib/floki/parser.ex new file mode 100644 index 00000000..879bf3d7 --- /dev/null +++ b/lib/floki/parser.ex @@ -0,0 +1,11 @@ +defmodule Floki.Parser do + @moduledoc false + @floki_root_node "floki" + + def parse(html) do + html = "<#{@floki_root_node}>#{html}" + {@floki_root_node, [], parsed} = :mochiweb_html.parse(html) + + if length(parsed) == 1, do: hd(parsed), else: parsed + end +end 
diff --git a/mix.lock b/mix.lock index 22ee8e04..a392dc37 100644 --- a/mix.lock +++ b/mix.lock @@ -1,5 +1,5 @@ -%{"earmark": {:hex, :earmark, "0.1.15"}, - "ex_doc": {:hex, :ex_doc, "0.7.2"}, - "inch_ex": {:hex, :inch_ex, "0.2.4"}, +%{"earmark": {:hex, :earmark, "0.1.17"}, + "ex_doc": {:hex, :ex_doc, "0.8.4"}, + "inch_ex": {:hex, :inch_ex, "0.4.0"}, "mochiweb": {:hex, :mochiweb, "2.12.2"}, "poison": {:hex, :poison, "1.4.0"}} diff --git a/test/floki_test.exs b/test/floki_test.exs index 13185b6c..c9e04423 100644 --- a/test/floki_test.exs +++ b/test/floki_test.exs @@ -285,4 +285,12 @@ defmodule FlokiTest do assert Floki.find(@xml, "title") == expected end + + @tag timeout: 1000 + test "find an inexistent element inside a invalid HTML" do + assert Floki.find("something", "a") == [] + assert Floki.find("", "a") == [] + assert Floki.find("foobar", "a") == [] + assert Floki.find("foobar Date: Sun, 23 Aug 2015 01:27:01 -0300 Subject: [PATCH 3/3] Bump version to 0.3.3 --- CHANGELOG.md | 7 ++++++- mix.exs | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a486839..89b9f6db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ This project adheres to [Semantic Versioning](http://semver.org/). ## [Unreleased][unreleased] +## [0.3.3] - 2015-08-23 + +### Fixed + - Fix `Floki.find/2` when there is a non-HTML input. It closes the [issue #17](https://github.com/philss/floki/issues/17) @@ -75,7 +79,8 @@ of the parent element inside HTML. - Elixir version requirement from "~> 1.0.0" to ">= 1.0.0". 
-[unreleased]: https://github.com/philss/floki/compare/v0.3.2...HEAD +[unreleased]: https://github.com/philss/floki/compare/v0.3.3...HEAD +[0.3.3]: https://github.com/philss/floki/compare/v0.3.2...v0.3.3 [0.3.2]: https://github.com/philss/floki/compare/v0.3.1...v0.3.2 [0.3.1]: https://github.com/philss/floki/compare/v0.3.0...v0.3.1 [0.3.0]: https://github.com/philss/floki/compare/v0.2.1...v0.3.0 diff --git a/mix.exs b/mix.exs index 4b70db18..00c375b0 100644 --- a/mix.exs +++ b/mix.exs @@ -3,7 +3,7 @@ defmodule Floki.Mixfile do def project do [app: :floki, - version: "0.3.2", + version: "0.3.3", elixir: ">= 1.0.0", package: package, description: description,