socketref,再见！高德

https://github.com/adoggie

C++博客 :: 首页 :: 联系 :: 聚合

:: 管理

246 Posts :: 4 Stories :: 312 Comments :: 0 Trackbacks

常用链接

留言簿(54)

我参与的团队

随笔分类

随笔档案

文章分类

文章档案

相册

Gis

OpenSource

搜索

阅读排行榜

评论排行榜

抓www.xunbao173.com的交易记录

5173.com 为了做数据分析，提出了抓取同行交易系统业务信息的需求，我花了 1 天时间用 python 完成。

  1 # -*- coding:utf-8 -*-
  2 #扫描xunbao173.com web页面记录到数据库
  3 #zhangbin 2010.5.12  5173.com
  4 import sys,os
  5
  6 import traceback,threading,time,struct,os,os.path,zlib,struct
  7 import copy,socket,select
  8 #import psycopg2
  9 import httplib,re
10
11 import log,config
12
13
14 #function Paging(total) {
15 #    this.pageSize = 10;//每页显示记录数
16 #    this.step = 5;//最多显示分页页数
17 #    this.total = total; //总记录数
18 #}
19
20 '''
21
22 CREATE DATABASE htmlgrep
23   WITH OWNER = postgres
24        ENCODING = 'UTF8';
25
26
27 CREATE TABLE htmlGrep
28 (
29    id integer,
30    item_name character(60) NOT NULL,
31    price_s character(40),
32      rank integer,
33    appear_time integer NOT NULL,
34    disappear_time integer NOT NULL
35 ) WITH (OIDS=TRUE)
36 ;
37
38 '''
39
# Shared configuration object; 'grep.conf' is opened as soon as the module loads.
g_conf = config.SimpleConfig()
g_conf.open('grep.conf')
# Database connection slot. Only the commented-out getDBConn() would assign it.
g_dbconn = None
# Logger constructed with 'hgrep.log' (presumably the log file path -- confirm in log.Logger).
g_logger = log.Logger('hgrep.log')
# File object that receives raw page HTML; assigned by the script entry code before scanning.
g_flog = None

# Session cookie: stored by scanGameServers() and sent by getPageHtml().
g_cookie = ''
47
48 #def getDBConn():
49 #    global g_dbconn
50 #    try:
51 #        if g_dbconn == None:
52 #            dbhost=g_conf.getPropertyValue('dbhost','localhost')
53 #            dbname='gamegrep'
54 #            dbuser=g_conf.getPropertyValue('dbuser','postgres')
55 #            dbpasswd=g_conf.getPropertyValue('dbpasswd','111111')
56 #            g_dbconn = psycopg2.connect(host=dbhost,database=dbname,user=dbuser,password=dbpasswd)
57 #    except:
58 #        g_logger.error(traceback.format_exc())
59 #    return g_dbconn
60
61 #检索页数量
62
63 '''
64 <input type="hidden" id="currentPage" value="1"/>
65 <input type="hidden" id="orderBy" value=""/>
66 <input type="hidden" id="pageTotal" value="24"/>
67 '''
68
def getPageNum(html):
    """Return the value of the hidden ``pageTotal`` input found in *html*.

    The page embeds ``<input type="hidden" id="pageTotal" value="24"/>``;
    the text of the ``value`` attribute of the first such input is returned
    unchanged, as a string.

    When the input is missing the string ``'0'`` is returned instead of
    raising IndexError, so callers that pass the result to ``int()`` keep
    working on unexpected pages.
    """
    found = re.search(r'id="pageTotal" value="(.*?)"', html, re.S)
    if found is None:
        return '0'
    return found.group(1)
75
def getPageHtml(game,idx):
    """Fetch page *idx* of the buy listing of *game* and return its HTML.

    Sends ``GET /<game>/buy.gsp?keyWord=&groupName=&orderBy=&page=<idx>`` to
    the host configured under 'root.site', carrying the cookie held in the
    module-level ``g_cookie``.

    The request advertises ``Accept-Encoding: gzip,deflate``, so a body that
    comes back with a matching Content-Encoding is decompressed before it is
    returned; an unencoded body is returned as read.

    Raises whatever httplib raises on network errors, and zlib.error when a
    body labelled as compressed cannot be decompressed.
    """
    gamesite = g_conf.getPropertyValue('root.site')
    hdr = {'Cookie':g_cookie,
           'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'Accept-Charset':'GB2312,utf-8;q=0.7,*;q=0.7',
           'Accept-Language':'zh-cn,zh;q=0.5',
           'Keep-Alive':'300',
           'Connection':'keep-alive',
           'Accept-Encoding':'gzip,deflate'
           }

    conn = httplib.HTTPConnection(gamesite)
    try:
        conn.request("GET", "/%s/buy.gsp?keyWord=&groupName=&orderBy=&page=%s"%(game,idx),'',hdr)
        r1 = conn.getresponse()
        html = r1.read()
        encoding = (r1.getheader('content-encoding') or '').lower()
    finally:
        # Each call opens its own connection, so release it here.
        conn.close()

    if encoding == 'gzip':
        # 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
        html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
    elif encoding == 'deflate':
        try:
            html = zlib.decompress(html)
        except zlib.error:
            # Some servers send a raw deflate stream without the zlib header.
            html = zlib.decompress(html, -zlib.MAX_WBITS)
    return html
95
96
def scanRecordsOfHtml(f,html,serverid,serverName):
    """Write one comma-separated line to *f* for every item record in *html*.

    A record is matched by four captures, in this order: the text of the
    ``realName`` span, the cell following ``rank``, the cell following
    ``price``, and the first quoted argument of ``linkTo(...)``.  Each output
    line holds those four values followed by *serverid* and *serverName*.

    *f* only needs a ``write`` method.  Nothing is returned.
    """
    recordPattern = re.compile(
        r'''<span class="realName">(.*?)</span>.*?rank.*?>(.*?)</dd>.*?price.*?>(.*?)</dd>.*?linkTo\('(.*?)',''',
        re.S)
    for match in recordPattern.finditer(html):
        fields = match.groups() + (serverid, serverName)
        f.write("%s,%s,%s,%s,%s,%s\n" % fields)
105
106
107 #扫描游戏服务器 [{name,url}]
def _httpGet(site, path, headers=None):
    """GET *path* from *site*; return (body, set_cookie_header).

    With *headers* the request is sent with an empty body argument and those
    headers; without them a bare GET is issued.  A body labelled gzip or
    deflate by Content-Encoding is decompressed.  The connection is always
    closed before returning.
    """
    conn = httplib.HTTPConnection(site)
    try:
        if headers is None:
            conn.request("GET", path)
        else:
            conn.request("GET", path, '', headers)
        resp = conn.getresponse()
        body = resp.read()
        encoding = (resp.getheader('content-encoding') or '').lower()
        setCookie = resp.getheader('set-cookie')
    finally:
        conn.close()
    if encoding == 'gzip':
        # 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
        body = zlib.decompress(body, 16 + zlib.MAX_WBITS)
    elif encoding == 'deflate':
        try:
            body = zlib.decompress(body)
        except zlib.error:
            # Raw deflate stream without the zlib header.
            body = zlib.decompress(body, -zlib.MAX_WBITS)
    return body, setCookie


def scanGameServers(game):
    """Scan every server of *game* and dump its listing records to text files.

    Steps:
      1. Fetch ``/<game>/getServerList`` from the host configured under
         'root.site' and take the second ``ser_area_list`` div.
      2. Extract (aid, id, title) for each server link in that div.
      3. Store the first segment of the Set-Cookie header in ``g_cookie`` so
         that getPageHtml() sends the same session cookie.
      4. For each server: request its getServerList URL, pause 0.2 s, fetch
         ``buy.gsp`` to read the page total, then fetch each page with
         getPageHtml(), append the raw HTML to ``g_flog`` and write the
         parsed records to ``<title>.txt`` (file name re-encoded to gb2312).

    Returns None.  Prints a message and returns early when the server list
    page does not contain exactly two ``ser_area_list`` divs or no server
    link is found.  A failure while handling one server is logged through
    ``g_logger`` and the loop continues with the next server.

    Note: print(...) is used with a single argument only, which behaves the
    same as the Python 2 print statement.
    """
    global g_cookie
    PAGE_SIZE = 10

    gamesite = g_conf.getPropertyValue('root.site')
    html, setCookie = _httpGet(gamesite, "/%s/getServerList"%(game))

    areas = re.findall("<div class=\"ser_area_list\">(.*?)</div>",html,re.S)
    if len(areas)!=2:
        print('Html content invalid!')
        return
    # Links look like: getServerList?aid=15&id=1136
    servers = re.findall("<a href=\".*?aid=(.*?)&id=(.*?)\">.*?title=\"(.*?)\".*?</a>",areas[1],re.S)
    if len(servers) == 0:
        print('Game:%s is null!'%(game))
        return
    print('%s servers Found'%len(servers))

    # A response without Set-Cookie yields an empty cookie instead of crashing.
    cookie = (setCookie or '').split(';')[0]
    g_cookie = cookie

    hdr={'Cookie':cookie,
         'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
         'Accept-Charset':'GB2312,utf-8;q=0.7,*;q=0.7',
         'Accept-Language':'zh-cn,zh;q=0.5',
         'Keep-Alive':'300',
         'Connection':'keep-alive',
         'Accept-Encoding':'gzip,deflate'
         }

    # Enter each game server in turn.
    for server in servers:
        try:
            print(server)
            outName = server[2].decode('utf-8').encode('gb2312')+'.txt'
            # 'with' closes the output file even when a request or parse fails.
            with open(outName,'w') as f:
                print("/%s/%s"%(game,server[0]))
                url = "getServerList?aid=%s&id=%s"%(server[:2])
                print(url)

                # The response body is not used; the request is kept because
                # the original issued it before reading buy.gsp.
                _httpGet(gamesite, "/%s/%s"%(game,url), hdr)
                print('have a sleep\n\n')
                time.sleep(.2)

                html, _ = _httpGet(gamesite, "/%s/%s"%(game,'buy.gsp'), hdr)
                # '//' keeps the integer division the Python 2 '/' performed.
                PAGE_COUNT = int(getPageNum(html))//PAGE_SIZE + 1

                for page in range(1,PAGE_COUNT+1):
                    print('attempt to grep Game=%s Page=%s\n\n'%(game,page))
                    html = getPageHtml(game,page)
                    g_flog.write(html)
                    scanRecordsOfHtml(f,html,server[1],server[2])
        except Exception:
            # Log and move on; KeyboardInterrupt is no longer swallowed.
            g_logger.error(traceback.format_exc())
185
186
187
188 #def scanGameServers2(game):
189 #    url = "/%s/buy.gsp"%(game)
190 #    gamesite=g_conf.getPropertyValue('root.site')
191 #    conn = httplib.HTTPConnection(gamesite)
192 #    conn.request("GET", url)
193 #    print url
194 #    r1 = conn.getresponse()
195 #    html = r1.read()
196 #    #print html
197 #    print html
198 #    g_flog.write( html)
199 #
200 #def scanRecords(file):
201 #    f = open(file,'r')
202 #    html = f.read()
203 #    f.close()
204 #    regex = '''<span class="realName">(.*?)</span>.*?rank.*?>(.*?)</dd>.*?price.*?>(.*?)</dd>.*?linkTo\('(.*?)','''
205 #    ms = re.findall(regex,html,re.S)
206 #    f = open('hgrep.rec.txt','w')
207 #    conn = getDBConn()
208 #
209 #    for n in ms:
210 #        f.write("%s,%s,%s,%s\n"%n)
211 #        try:
212 #            cr = conn.cursor()
213 #            sql = "select count(*) from htmlgrep where id=%s"%(n[3])
214 #            cr.execute(sql)
215 #
216 #            rs = cr.fetchone()
217 #            if rs[0] == 0 :
218 #            #if 1:
219 #                #cr = conn.cursor()
220 #                sql="insert into htmlgrep values(%s,%s,%s,%s,%s,%s);"
221 #                cr.execute(sql,( int(n[3]),n[0],n[2],n[1],int(time.time()),0,))
222 #                conn.commit()
223 #            else:
224 #                sql = "update htmlgrep set disappear_time=0 where id=%s"%(int(n[3]))
225 #                cr.execute(sql)
226 #                conn.commit()
227 #
228 #        except:
229 #            g_logger.error(traceback.format_exc())
230 #    #如果db内的数据不存在当前缓存内则标记为物品消失,并记录消失时间
231 #    cr = conn.cursor()
232 #    cr.execute('select id from htmlgrep order by id')
233 #    rs = cr.fetchone()
234 #    while rs:
235 #        found = False
236 #        for n in ms:
237 #            if int(n[3]) == rs[0]:
238 #                found = True
239 #                break
240 #        if not found:
241 #            cr2 = conn.cursor()
242 #            sql = "update htmlgrep set disappear_time=%s where id=%s"%(int(time.time()),rs[0])
243 #            cr2.execute(sql)
244 #        rs = cr.fetchone()
245 #        conn.commit()
246 #    f.close()
247 #    #print str(ms)
248 ##############################################################
249
250
class sepApp:
    """Application shell that owns a SimpleConfig instance.

    Construction only creates the configuration object.  No logger is
    attached anywhere in this class, so getLogger() returns None unless a
    caller assigns ``_log``.
    """

    # Class-level default so getLogger() returns None instead of raising
    # AttributeError when no logger has been assigned.
    _log = None

    def __init__(self):
        self._conf = config.SimpleConfig()

    def getConfig(self):
        """Return the SimpleConfig object created in __init__."""
        return self._conf

    # Disabled database helpers, kept for reference:
    #def getDBConn(self):
    #    try:
    #        if self.dbconn == None:
    #            dbhost=self.getPropertyValue('dbhost','localhost')
    #            dbname=self.getPropertyValue('dbname','IpRedirect')
    #            dbuser=self.getPropertyValue('dbuser','postgres')
    #            dbpasswd=self.getPropertyValue('dbpasswd','111111')
    #            self.dbconn = psycopg2.connect(host=dbhost,database=dbname,user=dbuser,password=dbpasswd)
    #    except:
    #        self._log.error(traceback.format_exc())
    #    return self.dbconn

    #def resetDBConn(self):
    #    self.dbconn = None

    def getLogger(self):
        """Return the logger stored in ``_log`` (None when none was set)."""
        return self._log

    def run(self, args=None):
        """Placeholder entry point; always returns 0.

        The class used to define run() twice, and the second definition
        silently replaced the first.  They are merged here: *args* is optional
        so both call forms, run() and run(args), are accepted.
        """
        return 0
283
284
285
286 ##############################################################
287 ##############################################################
288
289 #scanRecords('c:/test - Copy.html')
290
291
292 '''
293 sql test:
294 ---------------------
295 --select count(*) from htmlgrep
296 --select id,count(id) from htmlgrep group by id limit 100
297 --select * from htmlgrep where id = 2310
298 --delete from htmlgrep
299
300 '''
# Script entry point.
# The scan used to run unconditionally at module level and was followed by
# sys.exit(0), so the guard below was unreachable and importing the module
# started a scan.  Everything now lives under the guard.
if __name__=='__main__':
    # With no argument the script still scans, as the unconditional call did.
    action = 'scan'
    if len(sys.argv) >= 2:
        action = sys.argv[1]
    if action == 'scan':
        g_flog = open('c:/test.txt','w')
        try:
            scanGameServers('mhzx')
        finally:
            # Flush and release the raw-HTML dump even if the scan fails.
            g_flog.close()
    else:
        print('usage: grep.py scan | build')
    sys.exit(0)
    #if sys.argv[1]=='build':
    #    scanRecords('c:/test.txt')
313     #server = sepApp()
314
315
316
317
318

posted on 2010-06-10 23:23 放屁阿狗 阅读(6252) 评论(1) 编辑 收藏 引用 所属分类: perl/python/php/lua/tcl

Feedback

# re: 抓www.xunbao173.com的交易记录 2016-02-27 14:26 王正顺

1 回复更多评论

刷新评论列表

只有注册用户登录后才能发表评论。


相关文章: salesforce的oauth授权 python::简单数据对象到Object自动转换 python::配置文件简单读取 SimpleConfig python::代码学习 python编写网络通信框架-基于线程的消息传送 gis:ks102 设备解码 gis::ks108设备接入解码 Ogr::简化LineString输出到shp文件如何Python编写COM server python::写ctypes的好帮手pyglet.wrap.py

网站导航: 博客园 IT新闻 BlogJava 知识库博问管理