XPathでmixiのHTMLから本文を取り出す
XPathの前に「いろいろ」やるけど。
mixiのHTMLというか、backup_mixiが出力したHTML。ここから本文・タイトル・日付を取り出す(ついでに、はてなダイアリー仕様のXMLとして出力する)。
mixiのhtml、古いと思うな。Webにオープンじゃないからいいのかな。
#!/usr/bin/env ruby
# Base directory of the run; the diary HTML files are read from "#{base_dir}/diary".
base_dir = '.'
# Path of the XML file that is written once every entry has been processed.
output = 'hatena.xml'
require 'rexml/document'
require 'rexml/xpath'
require 'iconv'
# Iconv.new takes (to, from): s2u converts Shift_JIS text to UTF-8.
s2u = Iconv.new("UTF-8", "Shift_JIS")
# The reverse direction: UTF-8 text to Shift_JIS.
u2s = Iconv.new("Shift_JIS", "UTF-8")
# Removes every opening (or self-closing) +name+ tag from +html+ in place.
# Closing tags (</name>) are left untouched.
#
# html - String to modify.
# name - tag name; it is interpolated into the regexp unescaped.
#
# Returns +html+ if at least one tag was removed, nil otherwise
# (the return value of String#gsub!).
def delete_otag!(html, name)
  # \b stops the match at the end of the tag name, so "b" no longer removes
  # <br> or <body>, which merely start with the same letters.
  html.gsub!(/<#{name}\b[^>]*>/, '')
end
# Removes every opening, closing and self-closing +name+ tag from +html+ in
# place. Only the tags are removed; the text between them stays.
#
# html - String to modify.
# name - tag name; it is interpolated into the regexp unescaped.
#
# Returns +html+ if at least one tag was removed, nil otherwise
# (the return value of String#gsub!).
def delete_tag!(html, name)
  # \b stops the match at the end of the tag name, so "a" no longer removes
  # <abbr> and "b" no longer removes <br> or <body>.
  # A trailing "/" of a self-closing tag is already consumed by [^>]*.
  html.gsub!(%r{</?#{name}\b[^>]*>}, '')
end
# Replaces every opening +name+ tag in +html+ with a bare <name>, dropping its
# attributes, in place. Closing tags are left untouched. A self-closing
# <name .../> also becomes <name>.
#
# html - String to modify.
# name - tag name; it is interpolated into the regexp unescaped.
#
# Returns +html+ if at least one tag was rewritten, nil otherwise
# (the return value of String#gsub!).
def simplify_tag!(html, name)
  # \b stops the match at the end of the tag name, so a longer tag that only
  # starts with +name+ is not rewritten into <name>.
  html.gsub!(/<#{name}\b[^>]*>/, "<#{name}>")
end
# Builds the export document, adds one <day> element per diary HTML file found
# under "#{base_dir}/diary", and writes the result to +output+.
#
# NOTE(review): the string literal passed to REXML::Document.new was truncated
# in this copy of the script (only the opening quote survived), which made the
# file unparsable. "<diary></diary>" is a reconstruction inferred from
# `diary = xml.root` and the <day>/<body> children added below -- confirm the
# root element (and whether an XML declaration is wanted) against the Hatena
# Diary import format.
xml = REXML::Document.new("<diary></diary>")
diary = xml.root

Dir.foreach("#{base_dir}/diary") do |file_name|
  next if file_name !~ /\.html$/

  File.open("#{base_dir}/diary/#{file_name}") do |fin|
    begin
      html = fin.read

      # Entry date: 4 digits, 2 digits, 2 digits, each group followed by two
      # non-digit characters (presumably the two Shift_JIS bytes of the
      # year/month/day kanji -- TODO confirm). Earlier variant kept for reference:
      #day_match = /([0-9]+)#{u2s.iconv("年")}([0-9]+)#{u2s.iconv("月")}([0-9]+)#{u2s.iconv("日")}/.match(html)
      day_match = /([0-9]{4})[^0-9]{2}([0-9]{2})[^0-9]{2}([0-9]{2})[^0-9]{2}/.match(html)
      raise "date not found" unless day_match
      y, m, d = day_match.captures

      # Reduce the markup before parsing it as XML: drop tags that have no
      # closing counterpart, unwrap layout/inline tags, and strip attributes
      # from <html> and <title>. With the wrappers gone the XPath expressions
      # below address /html/div and /html/title directly.
      %w[img meta br].each { |tag| delete_otag!(html, tag) }
      %w[head body table td tr font a b link form].each { |tag| delete_tag!(html, tag) }
      %w[html title].each { |tag| simplify_tag!(html, tag) }

      mixi = REXML::Document.new(html)

      mixi_body = REXML::XPath.match(mixi, "/html/div[string(@id)='diary_body']")
      raise "diary_body div not found" if mixi_body.empty?
      body = mixi_body[0].text

      mixi_title = REXML::XPath.match(mixi, "/html/title")
      raise "title not found" if mixi_title.empty?
      # Non-bang gsub on purpose: gsub! returns nil when the "[mixi] " prefix
      # is absent, which turned every such entry into a conversion failure.
      title = mixi_title[0].text.gsub(/\[mixi\] */, '')

      # Convert both strings before touching the output document, so a failed
      # conversion cannot leave a half-built <day> element behind.
      title_utf8 = s2u.iconv(title)
      body_utf8 = s2u.iconv(body)

      xday = REXML::Element.new("day", diary)
      xday.attributes["date"] = "#{y}-#{m}-#{d}"
      xday.attributes["title"] = title_utf8
      xbody = REXML::Element.new("body", xday)
      xbody.text = body_utf8
    rescue => e
      # Best effort: report the file and the reason, then carry on with the next one.
      puts "convert failure: #{file_name}"
      $stderr.puts "  #{e.class}: #{e.message}"
    end
  end
end

File.open(output, 'wb') { |file| file << xml.to_s }