# News behavior
# =============
#
# Defines a tag to produce a "latest news" summary based on an XML feed.
#
#
# History
# -------
#
# 2006-08-06 (ADH): Created.
# 2006-08-07 (ADH): It seems the core RSS features are easily sufficient
#                   so fun though Simple-RSS was, it makes sense to use
#                   something that doesn't add extra dependencies. Moved
#                   over to the Ruby RSS parser. Since tags get expanded
#                   before filters run, but there is no way to escape
#                   text filtered by (say) Textile, instead get rid of
#                   characters known to be a problem.
# 2006-08-08 (ADH): Now knows about the prevalent part filter during tag
#                   processing and takes steps to escape the generated
#                   content. Markdown doesn't seem to need it but Textile
#                   is escaped; it turns out there is a '<notextile>' tag
#                   which does the job. Generating HTML from a tag is
#                   still conceptually wrong because of filter operations
#                   but for now I still want to keep this behavior very
#                   simple for its users. There is an RSS behavior which
#                   can be used if a more flexible scheme is required at
#                   the expense of more effort and less clean handling of
#                   empty RSS item fields.

require 'rss'

# Error raised when the `news' tag cannot be processed, for example when
# the mandatory 'feed' attribute is missing.
class TagError < StandardError
end

# Behavior that provides the <r:news/> tag, which renders a "latest news"
# HTML list built from the items of an RSS feed.
class NewsBehavior < Behavior::Base
  register 'News'

  description %{
    This behavior provides a 'news' tag which is supplied with a
    fully qualified URL pointing to an XML RSS feed. The feed is
    parsed and the 'latest news' summary generated from it. The URL
    is given in the mandatory 'feed' attribute within the tag.

    The 'headlines' attribute is optional; it defines how many entries
    will be included in the news summary and defaults to '4'.

    The 'dates' attribute is also optional; it says whether or not
    article published or modified dates (if found) will be added in
    small text after each headline. If '0' there are no dates, else
    dates are shown. The default value is '1', to show dates. Dates are
    extracted from the feed's "pubDate", "modified" or "dc_date" fields,
    in that order.

    Finally, an optional 'escape' attribute, defaulting to '1', ensures
    that RSS titles or links cannot be accidentally interpreted as
    Textile data for Textile filtered parts. Setting the attribute to
    '0' disables escaping to allow headlines marked up in Textile to be
    passed through to the Textile parser.

    Note that '<' and '>' characters in RSS item titles will always be
    escaped to HTML entities for security. Links to articles are run
    through URI::Escape() processing and have '~' characters changed to
    '%7E' for similar reasons.

    Example of use:

      <r:news feed="http://my.url/news.xml" headlines="4" dates="0" />
  }

  # Pages using this behavior must never be cached, because the feed (and
  # therefore the generated list) might change on every fetch.
  #
  # Returns false, always.
  def cache_page?
    false
  end

  # The <r:news.../> tag generates markup that might fall foul of the text
  # filter applied to the page part. Before handing over to the inherited
  # implementation, remember the filter ID of the object being parsed (or
  # nil if the object has no notion of a filter) so that the tag code run
  # later knows which filter it has to protect its output from.
  def parse_object(object)
    @filter = object.respond_to?(:filter_id) ? object.filter_id : nil

    super
  end

  # Tag definitions

  define_tags do

    # <r:news feed="feed_url" [headlines="number"] [dates="0|1"] [escape="0|1"] />
    #
    # Parses the RSS feed at the fully qualified URL given in the mandatory
    # 'feed' attribute and returns an HTML unordered list of its headlines.
    #
    # Attributes:
    #
    #   feed      - Mandatory. URL of the RSS feed.
    #   headlines - Optional, default '4'. Maximum number of entries in the
    #               list. Items without a title are skipped and do not
    #               count towards the limit.
    #   dates     - Optional, default '1'. If '0' no dates are shown, else
    #               the item date (if found) is added in small text after
    #               the headline. The "pubDate", "modified" and "dc_date"
    #               item fields are tried in that order and the first one
    #               holding a Time value is used.
    #   escape    - Optional, default '1'. Unless '0', output generated for
    #               a Textile filtered part is wrapped in <notextile> so
    #               that titles or links cannot be accidentally interpreted
    #               as Textile data. Set to '0' to allow headlines marked up
    #               in Textile to be passed through to the Textile parser.
    #
    # '<' and '>' characters in item titles are always escaped to HTML
    # entities for security. Links are run through URI::escape() and have
    # '~' characters changed to '%7E' for similar reasons.
    #
    # Raises TagError if no feed URL is given, or if the parser does not
    # return a feed for the given URL.
    #
    # Example of use:
    #
    #  <r:news feed="http://my.url/news.xml" headlines="4" dates="0" />
    #
    tag 'news' do |tag|
      feed      = tag.attr['feed']
      dates     = (tag.attr['dates']     || '1').to_i
      escape    = (tag.attr['escape']    || '1').to_i
      headlines = (tag.attr['headlines'] || '4').to_i

      raise TagError, "No feed URL given in `news' tag" if feed.nil? || feed.empty?

      # Fetch the feed and parse it. The parser hands back nil rather than
      # raising when the document is not a feed it recognises, so report
      # that as a tag error instead of failing later on a nil object.

      rss = RSS::Parser.parse(feed)

      raise TagError, "Feed given in `news' tag could not be parsed as RSS" if rss.nil?

      show_dates   = (dates != 0)
      wrap_textile = (escape != 0 && @filter == 'Textile')
      date_fields  = [:pubDate, :modified, :dc_date]

      # Escape the data for Textile filtered pages if required.

      out   = wrap_textile ? "<notextile><ul>\n" : "<ul>\n"
      shown = 0

      # Loop through the items in the feed until enough headlines have
      # been generated.

      rss.items.each do |item|
        break if shown >= headlines

        # The item must have at least a title.

        next if item.title.nil? || item.title.empty?

        shown += 1

        # Ensure the title string doesn't contain unsafe characters -
        # RSS feeds can be used maliciously.

        title = item.title.gsub('<', '&lt;').gsub('>', '&gt;')

        # Markdown doesn't process text here anyway, possibly because
        # the HTML list markup seems to stop it from doing so. Don't
        # escape Markdown for now - the code below has been tested and
        # does work though, so it can be introduced later if need be.
        #
        #if (escape != 0 and @filter == 'Markdown')
        #  title.gsub!(/([`*_{}\[\]()#.!])/) { '\\' + $& }
        #end

        out << '  <li>'

        # Insert link HTML if a link is present, escaping it and
        # manually converting "~" characters to the "%7E" equivalent.
        #
        # NOTE(review): URI::escape is deprecated and no longer exists in
        # Ruby 3.0 and later; it needs a replacement before this file can
        # run on a modern Ruby.

        if item.link.nil? || item.link.empty?
          out << title
        else
          link = URI::escape(item.link).gsub('~', '%7E')
          out << "<a href=\"#{link}\">#{title}</a>"
        end

        # Add the item publication/modification date if attributes say to
        # do so. An item may respond to a date accessor yet hold no value
        # for it, so take the first field that actually yields a Time:
        # pubDate (Typo blogs, The Register), modified (RForum
        # installations, generic) or dc_date (SlashDot).

        if show_dates
          candidates = date_fields.map { |field| item.send(field) if item.respond_to?(field) }
          time       = candidates.find { |value| value.is_a?(Time) }

          out << time.strftime(' <small>(%d-%b-%Y)</small>') unless time.nil?
        end

        # Close the list item.

        out << "</li>\n"
      end

      # Close the list, handle Textile escaping if necessary and
      # return the final chunk of data.

      out << "</ul>\n"
      out << '</notextile>' if wrap_textile

      out
    end

  end # From 'define_tags do'
end
