2009年2月8日星期日

帮人搬家之导入Blogger

在跟我实在不熟的Python奋斗了若干时间后,我才悲哀地发现,Blogger现在对API导入添加了限制。每天通过API发布一定量的帖子后,再发就得输入验证码,我太阳。

还好,通过尝试发现,Blogger GUI中提供的导入功能可以用。不然的话,某人的小500张帖子,还不知道要弄到什么时候去。

用JavaScript写这段代码,看起来就乱七八糟的。不过没办法,还是因为Ruby没有好用的HTMLParser。虽然在Ruby里可以调用ActiveX COM,但是会有回车换行符混乱的问题,将就用吧。

/**
 * Rewrites the parts of a post body that need changing. If nothing in the
 * body needs to be modified, this function does not have to be called.
 *
 * The HTML is parsed with an 'htmlfile' ActiveX document. Every <img> whose
 * src contains 'foto.ycstatic.com' has its src replaced by the value stored
 * in the external `imgmap` table under the URI-encoded original src.
 * NOTE(review): `imgmap` is not defined in the visible source; it is assumed
 * to be a global object mapping encoded old URLs to new URLs.
 *
 * @param {string} src - HTML fragment to process.
 * @returns {string} innerHTML of the parsed document body after rewriting.
 */
function parseHTML(src) {
  var doc = new ActiveXObject('htmlfile');
  doc.write(src);
  // Close the stream that write() opened so parsing is finished before the
  // DOM is queried.
  doc.close();

  var images = doc.getElementsByTagName('img');
  for (var i = 0; i < images.length; i++) {
    var oldSrc = images[i].src;
    if (oldSrc.indexOf('foto.ycstatic.com') === -1) {
      continue;
    }
    var newSrc = imgmap[encodeURIComponent(oldSrc)];
    // Keep the original URL when no replacement is known; assigning a
    // missing entry would turn the src into the text "undefined".
    if (newSrc != null) {
      images[i].src = newSrc;
    }
  }
  return doc.body.innerHTML;
}

/**
 * Converts one Atom feed file into a Blogger import file.
 *
 * Loads the feed at `path`, copies each entry's id, published/updated
 * timestamps, categories, title and summary into a new feed under a fixed
 * Blogger archive header, validates the result by parsing it, and saves it
 * to `path + '.txt'`.
 *
 * @param {string} path - Path of the source feed XML file.
 * @throws {Error} If the source file cannot be loaded, or if the generated
 *     feed is not well-formed XML.
 */
function parseXML(path) {
  // Escapes text for use in XML element content or a quoted attribute value.
  function escapeXml(text) {
    return String(text)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&apos;');
  }

  // Makes text safe inside a CDATA section by splitting any "]]>" terminator.
  function escapeCData(text) {
    return text.replace(/\]\]>/g, ']]]]><![CDATA[>');
  }

  // Returns the value of the first child node of the first <tagName>
  // descendant of `entry`, or '' when the element is missing or empty.
  function firstText(entry, tagName) {
    var nodes = entry.getElementsByTagName(tagName);
    if (nodes.length === 0) {
      return '';
    }
    var child = nodes[0].firstChild;
    if (!child || child.nodeValue == null) {
      return '';
    }
    return String(child.nodeValue);
  }

  var xml = new ActiveXObject('MSXML2.DOMDocument.3.0');
  // Load synchronously so the document is fully parsed before it is queried.
  xml.async = false;
  if (!xml.load(path)) {
    throw new Error('Failed to load ' + path + ': ' + xml.parseError.reason);
  }

  // Fixed feed header for the target blog.
  var parts = [
    "<?xml version='1.0' encoding='UTF-8'?>",
    "<?xml-stylesheet href='http://www.blogger.com/styles/atom.css' type='text/css'?>",
    "<feed xmlns='http://www.w3.org/2005/Atom' xmlns:openSearch='http://a9.com/-/spec/opensearchrss/1.0/' xmlns:gd='http://schemas.google.com/g/2005' xmlns:thr='http://purl.org/syndication/thread/1.0'>",
    "<id>tag:blogger.com,1999:blog-7617775710611731452.archive</id>",
    "<updated>2009-02-08T18:10:51.127+08:00</updated>",
    "<title type='text'>幺贰和叁</title>",
    "<link rel='http://schemas.google.com/g/2005#feed' type='application/atom+xml' href='http://otnth.blogspot.com/feeds/archive'/>",
    "<link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/7617775710611731452/archive'/>",
    "<link rel='http://schemas.google.com/g/2005#post' type='application/atom+xml' href='http://www.blogger.com/feeds/7617775710611731452/archive'/>",
    "<link rel='alternate' type='text/html' href='http://otnth.blogspot.com/'/>",
    "<author><name>小八</name><email>noreply@blogger.com</email></author>",
    "<generator version='7.00' uri='http://www.blogger.com'>Blogger</generator>"
  ];

  var entries = xml.getElementsByTagName('feed/entry');
  for (var i = 0; i < entries.length; i++) {
    var entry = entries[i];

    parts.push('<entry><id>' + escapeXml(firstText(entry, 'id')) + '</id>');
    parts.push('<published>' + escapeXml(firstText(entry, 'published')) + '</published>');
    parts.push('<updated>' + escapeXml(firstText(entry, 'updated')) + '</updated>');

    // Every entry is marked as a Blogger post.
    parts.push("<category scheme='http://schemas.google.com/g/2005#kind' term='http://schemas.google.com/blogger/2008/kind#post'/>");
    var cats = entry.getElementsByTagName('category');
    for (var j = 0; j < cats.length; j++) {
      // Categories in the Google Reader scheme are not copied.
      if (cats[j].getAttribute('scheme') === 'http://www.google.com/reader/') {
        continue;
      }
      var term = cats[j].getAttribute('term');
      if (term == null) {
        continue;
      }
      parts.push("<category scheme='http://www.blogger.com/atom/ns#' term='" + escapeXml(term) + "'/>");
    }

    parts.push('<title type="text">' + escapeXml(firstText(entry, 'title')) + '</title>');

    // Drop a trailing "..." from the summary and wrap it in a marker <div>.
    var body = '<div class="oldpost_ycool">' + firstText(entry, 'summary').replace(/\.{3}$/, '') + '</div>';
    // Optional hook, disabled in the original script:
    // body = parseHTML(body);
    parts.push('<content type="html"><![CDATA[' + escapeCData(body) + ']]></content>');

    parts.push("<author><name>小八</name></author>");

    parts.push("<thr:total>0</thr:total></entry>");
  }
  parts.push('</feed>');

  // Parse the generated text to verify it is well-formed before saving.
  var xn = new ActiveXObject('MSXML2.DOMDocument.3.0');
  xn.async = false;
  xn.loadXML(parts.join(''));

  var oError = xn.parseError;
  if (oError.errorCode !== 0) {
    throw new Error("An error occurred:\n错误代码: "
      + oError.errorCode + "\n"
      + "行数: " + oError.line + "\n"
      + "列数: " + oError.linepos + "\n"
      + "原因: " + oError.reason);
  }
  xn.save(path + '.txt');
}


/**
 * Walks the files of the current directory and hands every path whose last
 * four characters are ".xml" to parseXML.
 */
function main() {
  var fileSystem = new ActiveXObject('Scripting.FileSystemObject');
  var files = new Enumerator(fileSystem.GetFolder('.').Files);
  while (!files.atEnd()) {
    var filePath = String(files.item());
    var suffix = filePath.substr(filePath.length - 4);
    if (suffix === '.xml') {
      parseXML(filePath);
    }
    files.moveNext();
  }
}

// Script entry point: run the conversion as soon as the script is executed.
main();

没有评论 :