XPathでmixiのHTMLから本文を取り出す

XPathを適用する前に、不要なタグを正規表現で削除するなどの前処理を行う。

対象はmixiそのもののHTMLというより、backup_mixiが出力したHTML。ここから本文・タイトル・日付を取り出し、ついでにはてなダイアリー仕様のXMLとして出力する。


mixiのhtml、古いと思うな。Webにオープンじゃないからいいのかな。



#!/usr/bin/env ruby

# Directory that contains the "diary" subdirectory holding the *.html files
# to convert.
base_dir = "."
# Path of the XML file this script writes.
output = "hatena.xml"


require 'rexml/document'
require 'rexml/xpath'
require 'iconv'


# Shift_JIS -> UTF-8 converter (Iconv.new takes the target encoding first);
# applied to the extracted title and body before they go into the XML.
s2u = Iconv.new('UTF-8', 'Shift_JIS')
# UTF-8 -> Shift_JIS converter. NOTE(review): no active code in the visible
# part of this script calls it -- confirm before removing.
u2s = Iconv.new('Shift_JIS', 'UTF-8')

# Removes, in place, every opening (or self-closing) tag called +name+ from
# +html+. Closing tags and the text between tags are left untouched.
#
# The tag name must be followed by whitespace, "/" or ">", so asking for "b"
# does not also delete <br>, <body> or <blockquote>.
#
# html - String that is modified in place.
# name - tag name, matched literally and case-sensitively.
#
# Returns +html+ when at least one tag was removed, nil otherwise (the
# String#gsub! convention).
def delete_otag!(html, name)
  html.gsub!(/<#{Regexp.escape(name)}(?=[\s\/>])[^>]*>/, '')
end

# Removes, in place, every opening, closing or self-closing tag called +name+
# from +html+. Only the tags themselves are removed; the content between them
# stays.
#
# The tag name must be followed by whitespace, "/" or ">", so asking for "a"
# does not also delete <abbr> or <address>, and "b" leaves <blockquote> alone.
#
# html - String that is modified in place.
# name - tag name, matched literally and case-sensitively.
#
# Returns +html+ when at least one tag was removed, nil otherwise (the
# String#gsub! convention).
def delete_tag!(html, name)
  html.gsub!(/<\/?#{Regexp.escape(name)}(?=[\s\/>])[^>]*>/, '')
end

# Rewrites, in place, every opening (or self-closing) tag called +name+ in
# +html+ to the bare form "<name>", dropping all of its attributes. Closing
# tags are not touched.
#
# The tag name must be followed by whitespace, "/" or ">", so a longer tag
# that merely starts with +name+ is not rewritten.
#
# html - String that is modified in place.
# name - tag name, matched literally and case-sensitively.
#
# Returns +html+ when at least one tag matched, nil otherwise (the
# String#gsub! convention).
def simplify_tag!(html, name)
  html.gsub!(/<#{Regexp.escape(name)}(?=[\s\/>])[^>]*>/, "<#{name}>")
end


# Convert every *.html file under "#{base_dir}/diary" into one <day> element
# (date and title attributes, <body> child) and write the resulting document
# to `output`.
#
# The document is created with a <diary> root element: an empty document has
# no root, so <day> elements built with that nil parent would never be
# attached and the written file would be empty.
xml = REXML::Document.new("<diary></diary>")
diary = xml.root
Dir.foreach("#{base_dir}/diary") do |dir|
  next if dir !~ /\.html$/
  File.open("#{base_dir}/diary/#{dir}") do |fin|
    begin
      html = fin.read

      # First occurrence of: 4 digits, 2 digits, 2 digits, each followed by
      # two non-digit characters. Presumably these are the two Shift_JIS bytes
      # of the year/month/day kanji (an earlier variant built the pattern from
      # u2s-converted kanji) -- this assumes byte-wise regexp matching.
      day_match = /([0-9]{4})[^0-9]{2}([0-9]{2})[^0-9]{2}([0-9]{2})[^0-9]{2}/.match(html)
      raise "date not found" unless day_match
      y, m, d = day_match.captures

      # Strip markup before handing the text to REXML (presumably because the
      # raw HTML is not well-formed XML). Only tags are removed; their content
      # stays. The order of the calls is the same as it has always been.
      %w[img meta br].each { |tag| delete_otag!(html, tag) }
      %w[head body table td tr font a b link form].each { |tag| delete_tag!(html, tag) }
      %w[html title].each { |tag| simplify_tag!(html, tag) }

      mixi = REXML::Document.new(html)

      body_node = REXML::XPath.match(mixi, "/html/div[string(@id)='diary_body']")[0]
      raise "diary_body div not found" unless body_node
      title_node = REXML::XPath.match(mixi, "/html/title")[0]
      raise "title not found" unless title_node

      body = body_node.text
      # gsub, not gsub!: the bang form returns nil when the "[mixi] " prefix
      # is absent, which used to throw the title away.
      title = title_node.text.to_s.gsub(/\[mixi\] */, '')

      # Convert both strings before touching the output tree so that a
      # conversion error cannot leave a half-built <day> in the document.
      title_utf8 = s2u.iconv(title)
      body_utf8 = s2u.iconv(body)

      xday = REXML::Element.new("day", diary)
      xday.attributes["date"] = "#{y}-#{m}-#{d}"
      xday.attributes["title"] = title_utf8

      xbody = REXML::Element.new("body", xday)
      xbody.text = body_utf8
    rescue StandardError => e
      # Best effort: report the file together with the reason and carry on
      # with the next one.
      puts "convert failure: #{dir} (#{e.class}: #{e.message})"
    end
  end
end


File.open(output, 'wb') { |file| file << xml.to_s }