抓取QQ空间里某个号码的日志，python的乱码是个问题啊

# -*- coding: utf-8 -*-

from HttpRequestModule import *

import os
import json
import traceback

import codecs
from lxml import etree
import StringIO, gzip
import sys
# Python 2-only workaround: make UTF-8 the default codec for implicit
# str/unicode conversions. reload(sys) comes first because
# sys.setdefaultencoding is removed from the sys namespace after startup.
reload(sys)
sys.setdefaultencoding('utf-8')

def write_file(file_name, file_data, encoding,
               file_dir=r"D:\fs\test_data\qqzone"):
    """Write text to ``file_name`` inside ``file_dir`` using ``encoding``.

    Args:
        file_name: Name of the file to create, relative to ``file_dir``.
        file_data: Text to write. Nothing is written when it is empty.
        encoding: Codec name handed to ``codecs.open`` (e.g. ``'utf-8'``).
        file_dir: Directory that receives the file. Defaults to the path
            this script originally hard-coded; created when missing.

    Returns:
        None.
    """
    if len(file_data) == 0:
        print("file_data is zero")
        return
    # Create the target directory on demand so a fresh machine does not
    # fail with IOError on the first write.
    if not os.path.isdir(file_dir):
        os.makedirs(file_dir)
    file_path = os.path.join(file_dir, file_name)
    print(file_path)
    with codecs.open(file_path, "w", encoding) as f:
        f.write(file_data)

def decodeJson(json_string):
    """Parse ``json_string`` as JSON and return the decoded object.

    Args:
        json_string: The JSON document to decode.

    Returns:
        The decoded Python object, or None when decoding fails. Failures
        are reported on stdout instead of being raised.
    """
    try:
        return json.loads(json_string)
    except (TypeError, ValueError) as err:
        # Wrong input type or malformed JSON: a short message is enough.
        print('TypeError or ValueError:{0}'.format(err))
    except Exception:
        # Anything unexpected: dump the full traceback for diagnosis.
        print(traceback.format_exc())
    return None

def getUserBlogList():
    """Fetch the blog index for the hard-coded ``hostUin`` and return its entries.

    Returns:
        ``data['list']`` from the decoded response, or an empty list when
        the response body is empty, cannot be parsed as JSON, carries a
        non-zero ``code`` or reports ``totalNum`` <= 0.
    """
    # Built from adjacent literals so the URL has no embedded newlines or
    # indentation (the previous triple-quoted literal had both).
    diray_url = (
        'http://b1.qzone.qq.com/cgi-bin/blognew/get_abs'
        '?hostUin=859226880&blogType=0&cateName=&cateHex=&statYear=2015'
        '&reqInfo=7&pos=0&num=15&sortType=0&absType=0&source=0'
        '&rand=0.6346770680975169&ref=qzone&g_tk=1611717761&verbose=1'
    )
    data = doGet(diray_url)
    if len(data) == 0:
        print("data len is 0")
        return []
    # Drop the first 10 and the last 2 characters before JSON decoding.
    # NOTE(review): presumably this removes a JSONP wrapper such as
    # "_Callback(" ... ");" -- confirm against a live response.
    data_json = data[10:len(data) - 2]
    decode_json = decodeJson(data_json.decode("gbk"))
    if decode_json is None:
        print("decode_json is None")
        return []
    if decode_json['code'] != 0:
        # format() instead of "+": the comparison with 0 above implies a
        # numeric code, and str + int raises TypeError.
        print("server response code is {0}".format(decode_json['code']))
        return []
    data = decode_json['data']
    if data['totalNum'] <= 0:
        print("server response totalnum is {0}".format(data['totalNum']))
        return []
    return data['list']

def getUserBlog(uin, blogid):
    """Download the raw page body of one blog post.

    Args:
        uin: Account number substituted into the ``uin`` query parameter.
        blogid: Blog identifier substituted into the ``blogid`` parameter.

    Returns:
        The response body exactly as read from the server, or ``""`` when
        the request fails with ``URLError``. The request advertises
        ``Accept-Encoding: gzip`` and the body is not decompressed here.
    """
    # strip() removes the newline/indentation that the triple-quoted
    # template puts around the URL.
    url = ('''
    http://b1.qzone.qq.com/cgi-bin/blognew/blog_output_data?uin=%(uin)s&blogid=%(blogid)s&styledm=ctc.qzonestyle.gtimg.cn&imgdm=ctc.qzs.qq.com&bdm=b.qzone.qq.com&mode=2&numperpage=15&timestamp=1437033537&dprefix=&inCharset=gb2312&outCharset=gb2312&ref=qzone
    ''' % {'uin': uin, 'blogid': blogid}).strip()

    my_headers = {
        "Accept-Encoding": "gzip,deflate,sdch",
        "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Referer": "http://ctc.qzs.qq.com/qzone/newblog/blogcanvas.html",
    }
    # NOTE(review): urllib2 and URLError are not imported in this file;
    # presumably they arrive via "from HttpRequestModule import *".
    request = urllib2.Request(url, headers=my_headers)
    try:
        response = urllib2.urlopen(request)
    except URLError as e:
        # 'code' is checked before 'reason', as in the original.
        if hasattr(e, 'code'):
            print('The server couldn\'t fulfill the request. errorcode:{0}'.format(e.code))
        elif hasattr(e, 'reason'):
            print('We failed to reach a server. reason:{0}'.format(e.reason))
        return ""

    # Always release the connection, even if read() raises.
    try:
        return response.read()
    finally:
        response.close()

def getText(elem):
    """Return all text under ``elem`` joined together.

    Each fragment yielded by ``elem.itertext()`` is stripped of leading and
    trailing whitespace, and the fragments are joined with no separator.
    """
    return ''.join(fragment.strip() for fragment in elem.itertext())

def gzdecode(data):
    """Decompress a gzip-compressed byte string.

    Args:
        data: The compressed payload as a byte string.

    Returns:
        The decompressed bytes.
    """
    # Local import keeps this block self-contained; io.BytesIO is the
    # binary in-memory buffer available on Python 2.6+ and Python 3.
    import io

    gziper = gzip.GzipFile(fileobj=io.BytesIO(data))
    try:
        # Read the decompressed data.
        return gziper.read()
    finally:
        gziper.close()

def test(blogid):
    """Download one blog post, extract its text and save it as ``<blogid>.txt``.

    The page is fetched for the hard-coded account, gunzipped, then decoded
    and parsed. Decoding is tried as UTF-8 first; if anything in that
    attempt raises, the whole attempt is repeated with GBK. Exceptions
    raised inside an attempt are printed rather than propagated.
    """
    print(blogid)
    raw_page = gzdecode(getUserBlog('859226880', blogid))

    # (page codec, output-file codec, separator character, failure tag),
    # tried in this order.
    # NOTE(review): each attempt writes the file in the *other* codec than
    # the one it decoded with -- confirm that this is intentional.
    attempts = (
        ('utf-8', 'gbk', "*", "111"),
        ('gbk', 'utf-8', "_", "222"),
    )
    for page_codec, file_codec, rule_char, fail_tag in attempts:
        try:
            html = raw_page.decode(page_codec)
            detail_div = etree.HTML(html).xpath("//div[@id='blogDetailDiv']")[0]
            tgt_data = getText(detail_div)
            print(rule_char * 30)
            print(tgt_data)
            write_file(blogid + '.txt', tgt_data, file_codec)
            return
        except Exception as ex:
            # Space-joined to give the same line as a multi-argument
            # Python 2 print statement.
            print(" ".join((fail_tag, str(Exception), ":", str(ex))))


def main():
    """Scrape one fixed blog post, then every post in the blog list."""
    print("main")
    test("1288281044")
    for entry in getUserBlogList():
        blog_id = entry['blogId']
        print(blog_id)
        test(str(blog_id))


# Script entry point. There is no __main__ guard, so this also runs on import.
main()

发表于 2015-07-16 21:49 长寿梦 阅读(515) 评论(0) 编辑 收藏 引用 所属分类: python片段

抓取QQ空间里某个号码的日志，python的乱码是个问题啊

常用链接

留言簿(3)

随笔分类(81)

随笔档案(86)

文章分类(34)

文章档案(37)

c++博客

技术论坛

网络安全和黑客技术

资源

搜索

积分与排名

最新评论

阅读排行榜

评论排行榜

长寿梦的编程日常 2007年开始编程生涯，迄今已通多种语言，未来以期计算机技术结合某一专业领域，传授智慧给计算机智能辅助改进某一行业，成为顶级的领域处理专家。
C++博客 | 首页 | 发新随笔 | 发新文章 | 聚合 | 管理 | 随笔：86 文章：37 评论：48 引用：0