学着站在巨人的肩膀上

windows下配置python，django，mysql，memcache开发环境

这里做个记录，[分布式跨平台监控系统]肯定离不开的要配置windows下的环境，linux的一键安装程序有apt，zypper，yum等傻瓜工具，windows下有时候还成了问题。

1，将windows版python2.5装入d：/python25，将d：/python25添加入环境变量path

2，下载下django，在django目录里运行，python setup.py install ，会自动查找path环境变量，将django的库放入d：/python25/lib

3，装一个setuptools-0.6c11.win32-py2.5.rar，会在 D:\Python25\Scripts 中出现 easy_install.exe

4，装mysql api和memcache api，在D:\Python25\Scripts 目录下运行 easy_install.exe install mysqldb，或easy_install.exe install memcache提示要去 http://pypi.python.org/simple/ 找具体下载安装的包，打开网址找到相应的url然后 easy_install.exe install url即可

5，如果没有自动安装程序 setuptools-0.6c11.win32-py2.5 或装不了，可以直接复制以前 D:\Python25\Lib\site-packages下的 MySQLdb 文件夹到现在的D:\Python25\Lib\site-packages的目录下，只要版本对得上就可以正常运行，完全绿色的。

6，如果url安装不了，以前也没有用过，可以下载安装包，例如memcache的api安装可以去这里ftp://ftp.tummy.com/pub/python-memcached/old-releases/python-memcached-1.45.tar.gz 下载

然后解压进入目录执行python setup.py install

7，建立新的django项目或在以往的工程目录下运行 python manage.py syncdb （这里只会检测库中的表，没有表名就建立，如果有表名，结构被改变了是不会做任何修改的），同步数据库表结构，事先要在mysql里建立settings.py里设置的数据库。

posted @ 2010-03-15 19:25 学者站在巨人的肩膀上阅读(685) | 评论 (0) | 编辑收藏

[分布式跨平台监控系统]linux，windows下一句话发邮件-python脚本应用

前一阵花了点时间学习python，近段时间完成了一个监控服务器基本信息的项目，都是为了满足大家监控的欲望，特殊日志并报警的分布式系统，单台服务器采集粒度为1次/1分钟，一天大约1440条，目前监控了20多台服务器，一天大约31680条日志，现在单点监控中心服务器在性能上还绰绰有余，有更多的服务器来测试就好了，估计可以支持到100台以上服务器监控的级别。

现在遇到一个需求是发现报警时实时发送消息给相关人员，由于公司短信网关只买了上海电信的通道，而相关人员没有上海电信的号码，汗一个，只好通过发邮件来实施。

支持发送GB18030编码的文本内容，任意编码附件，可以做出适当修改支持群发。

·········10········20········30········40········50········60········70········80········90········100·······110·······120·······130·······140·······150

#coding=utf-8
#!/usr/lib/python2.5/bin/python
import os
import sys
from smtplib import SMTP
from email.MIMEMultipart import MIMEMultipart
from email.mime.application import MIMEApplication
from email.MIMEText import MIMEText
from email.MIMEBase import MIMEBase
from email import Utils,Encoders
import mimetypes
import time
STMP_SERVER = "mail.×××.com"
STMP_PORT = "25"
USERNAME = "×××@×××.com"
USERPASSWORD = "×××"
FROM = "MonitorCenterWarning@×××.com"
TO = "×××@gmail.com"
def sendFildByMail(config):
    """Build a MIME message described by ``config`` and send it over SMTP.

    ``config`` is a dict with these keys:

    * ``from``, ``to``, ``subject`` -- header values; ``from`` is also used
      as the envelope sender.
    * ``server``, ``port`` -- SMTP host and port (the port may be a string).
    * ``username``, ``password`` -- credentials passed to ``SMTP.login``.
    * ``content`` (optional) -- path of a text file used as the message
      body; it is declared to the recipient as GB2312 text.
    * ``file`` (optional) -- path of a file to attach.

    The message is delivered to both ``config['to']`` and ``config['from']``.
    Errors from file access or from smtplib are not caught here.
    """
    print('Preparing...')
    message = MIMEMultipart()
    message['from'] = config['from']
    message['to'] = config['to']
    message['Reply-To'] = config['from']
    message['Subject'] = config['subject']
    # RFC 2822 date; time.ctime() produces a format mail software may reject.
    message['Date'] = Utils.formatdate(localtime=True)
    message['X-Priority'] = '3'
    message['X-MSMail-Priority'] = 'Normal'
    message['X-Mailer'] = 'Microsoft Outlook Express 6.00.2900.2180'
    message['X-MimeOLE'] = 'Produced By Microsoft MimeOLE V6.00.2900.2180'

    if 'file' in config:
        # Add the attachment.
        f = open(config['file'], 'rb')
        try:
            attachment = MIMEApplication(f.read())
        finally:
            f.close()
        attachment.add_header('Content-Disposition', 'attachment',
                              filename=os.path.basename(config['file']))
        message.attach(attachment)

    if 'content' in config:
        # Add the text body.
        f = open(config['content'], 'rb')
        try:
            content = f.read()
        finally:
            f.close()
        # The second argument is the MIME subtype, not a transfer encoding:
        # 'plain' yields text/plain, whereas 'base64' declared text/base64.
        body = MIMEText(content, 'plain', 'gb2312')
        message.attach(body)
    print('OKay')

    print('Logging...')
    smtp = SMTP(config['server'], int(config['port']))
    try:
        # Comment out the next line if the SMTP server does not require
        # an authenticated login.
        smtp.login(config['username'], config['password'])
        print('OK')
        sys.stdout.write('Sending... ')
        smtp.sendmail(config['from'], [config['from'], config['to']],
                      message.as_string())
        print('OK')
        smtp.quit()
    finally:
        # Release the socket even when login or sending failed.
        smtp.close()
    time.sleep(1)
if __name__ == "__main__":
    # Command line: <script> contentfilename [attachfilename]
    argc = len(sys.argv)
    if argc not in (2, 3):
        # Too few or too many arguments: show the usage instead of
        # silently sending nothing.
        script = os.path.basename(sys.argv[0])
        print('Usage: python %s contentfilename' % script)
        print('OR Usage: python %s contentfilename attachfilename' % script)
        # Wait for Enter before exiting.
        raw_input("quit.")
        sys.exit(-1)

    config = {
        'from': FROM,
        'to': TO,
        'content': sys.argv[1],
        'server': STMP_SERVER,
        'port': STMP_PORT,
        'username': USERNAME,
        'password': USERPASSWORD,
    }
    if argc == 3:
        config['subject'] = '[MonitorCenter]Send Msg and File %s %s' % (sys.argv[1], sys.argv[2])
        config['file'] = sys.argv[2]
    else:
        config['subject'] = '[MonitorCenter]Send Msg %s' % sys.argv[1]

    sendFildByMail(config)
    raw_input("end.")

#coding=utf-8
#!/usr/lib/python2.5/bin/python
import os
import sys
from smtplib import SMTP
from email.MIMEMultipart import MIMEMultipart
from email.mime.application import MIMEApplication
from email.MIMEText import MIMEText
from email.MIMEBase import MIMEBase
from email import Utils,Encoders
import mimetypes
import time
STMP_SERVER = "mail.×××.com"
STMP_PORT = "25"
USERNAME = "×××@×××.com"
USERPASSWORD = "×××"
FROM = "MonitorCenterWarning@×××.com"
TO = "×××@gmail.com"
def sendFildByMail(config):
    """Build a MIME message described by ``config`` and send it over SMTP.

    ``config`` is a dict with these keys:

    * ``from``, ``to``, ``subject`` -- header values; ``from`` is also used
      as the envelope sender.
    * ``server``, ``port`` -- SMTP host and port (the port may be a string).
    * ``username``, ``password`` -- credentials passed to ``SMTP.login``.
    * ``content`` (optional) -- path of a text file used as the message
      body; it is declared to the recipient as GB2312 text.
    * ``file`` (optional) -- path of a file to attach.

    The message is delivered to both ``config['to']`` and ``config['from']``.
    Errors from file access or from smtplib are not caught here.
    """
    print('Preparing...')
    message = MIMEMultipart()
    message['from'] = config['from']
    message['to'] = config['to']
    message['Reply-To'] = config['from']
    message['Subject'] = config['subject']
    # RFC 2822 date; time.ctime() produces a format mail software may reject.
    message['Date'] = Utils.formatdate(localtime=True)
    message['X-Priority'] = '3'
    message['X-MSMail-Priority'] = 'Normal'
    message['X-Mailer'] = 'Microsoft Outlook Express 6.00.2900.2180'
    message['X-MimeOLE'] = 'Produced By Microsoft MimeOLE V6.00.2900.2180'

    if 'file' in config:
        # Add the attachment.
        f = open(config['file'], 'rb')
        try:
            attachment = MIMEApplication(f.read())
        finally:
            f.close()
        attachment.add_header('Content-Disposition', 'attachment',
                              filename=os.path.basename(config['file']))
        message.attach(attachment)

    if 'content' in config:
        # Add the text body.
        f = open(config['content'], 'rb')
        try:
            content = f.read()
        finally:
            f.close()
        # The second argument is the MIME subtype, not a transfer encoding:
        # 'plain' yields text/plain, whereas 'base64' declared text/base64.
        body = MIMEText(content, 'plain', 'gb2312')
        message.attach(body)
    print('OKay')

    print('Logging...')
    smtp = SMTP(config['server'], int(config['port']))
    try:
        # Comment out the next line if the SMTP server does not require
        # an authenticated login.
        smtp.login(config['username'], config['password'])
        print('OK')
        sys.stdout.write('Sending... ')
        smtp.sendmail(config['from'], [config['from'], config['to']],
                      message.as_string())
        print('OK')
        smtp.quit()
    finally:
        # Release the socket even when login or sending failed.
        smtp.close()
    time.sleep(1)
if __name__ == "__main__":
    # Command line: <script> contentfilename [attachfilename]
    argc = len(sys.argv)
    if argc not in (2, 3):
        # Too few or too many arguments: show the usage instead of
        # silently sending nothing.
        script = os.path.basename(sys.argv[0])
        print('Usage: python %s contentfilename' % script)
        print('OR Usage: python %s contentfilename attachfilename' % script)
        # Wait for Enter before exiting.
        raw_input("quit.")
        sys.exit(-1)

    config = {
        'from': FROM,
        'to': TO,
        'content': sys.argv[1],
        'server': STMP_SERVER,
        'port': STMP_PORT,
        'username': USERNAME,
        'password': USERPASSWORD,
    }
    if argc == 3:
        config['subject'] = '[MonitorCenter]Send Msg and File %s %s' % (sys.argv[1], sys.argv[2])
        config['file'] = sys.argv[2]
    else:
        config['subject'] = '[MonitorCenter]Send Msg %s' % sys.argv[1]

    sendFildByMail(config)
    raw_input("end.")

windows xp下：

linux ubuntu，suse下：

收到的结果：

posted @ 2010-03-15 19:24 学者站在巨人的肩膀上阅读(703) | 评论 (0) | 编辑收藏

[分布式跨平台监控系统]linux下监控网络流量和网速-python脚本应用

由于上证所，深交所level1，level2金融数据服务器在上午9：00开始到11：30和下午13：00开始到15：30一共大约5个小时的时间内流量比较大所以被监控服务器的网络流速算是一个被监控的重要指标。可以通过累加一段时间内各个网卡的上行，下行流量除以这个时间间隔计算出这段时间内的平均网速，我现在的采集频率是1分钟采集一次，在实际开盘期间运行过程中得到的网速监控信息还是比较准确的，都保持在5M/S左右的速度，有时候在平时非服务期看见某台服务器的内网网卡网速达到5M/S ，果然就是有人在大手笔传输。

独立的监控脚本是返回一个列表嵌套元组的数据结构，最后再汇总成一个完整的XML数据岛，为了调试方便脚本的每一个中间结果都导出到一个临时文本中。

运行以下脚本要确定你的linux装了ethtool工具，在ubuntu2.6.27-7-server，ubuntu22.6.27.19-5-default，suse 2.6.27.19-5-default 测试通过。

代码：

view plain copy to clipboard print ?

#coding=utf-8
#!/usr/bin/python
import re
import os
import time
import utils
def sortedDictValues3(adict):
    """Return the values of ``adict`` as a list, ordered by ascending key."""
    # sorted() works on any iterable of keys, so this does not depend on
    # dict.keys() returning a sortable list.
    return [adict[key] for key in sorted(adict)]
# NOTE(review): the indentation of this listing has been lost; the nesting
# mentioned in the comments below is inferred from statement order and must
# be restored before the function can run.
#
# run() collects per-interface data from `ifconfig` and `ethtool` and returns
# a list of tuples: ('order', '48') followed by the items of newnetcard
# (interface name -> summary string). When utils.isLinux() is False it returns
# a single error tuple instead. State between calls is kept in ./oldifconfig
# as "<unix timestamp>#<ifconfig output>"; ifconfigtemp and ethtooltemp are
# scratch files written to the current directory.
def run():
if utils.isLinux() == False:
return [('ifconfig_collect os type error','this is windows')]
#not first run
if os.path.isfile('./oldifconfig'):
fileold = open('./oldifconfig', 'r')
fileold.seek(0)
# Read the traffic snapshot saved by the previous call, and its timestamp.
# NOTE(review): the two-value unpacking fails if the file contains more than one '#'.
(oldtime, fileoldcontent) = fileold.read().split('#')
# NOTE(review): `close` is referenced but not called, so the file is not closed here.
fileold.close;
netcard = {}
tempstr = ''
key = ''
for strline in fileoldcontent.split('\n'):
# NOTE(review): '^lo*.' matches any line starting with 'l' plus one more
# character, not only "lo"; presumably meant to stop at the loopback section.
reobj = re.compile('^lo*.')
if reobj.search(strline):
break;
# NOTE(review): '^eth*.' matches "et", any number of 'h', then any character.
reobj = re.compile('^eth*.')
if reobj.search(strline):
key = strline.split()[0]
# tempstr is never reset, so it keeps accumulating across interfaces.
tempstr = tempstr + strline + '\n'
netcard[key] = tempstr
RXold = {}
TXold = {}
for key,value in netcard.items():
tempsplit = value.split('\n')
netcard[key] = ''
for item in tempsplit:
item = item + '<br>'
netcard[key] = netcard[key] + item
tempcount = 1
# The first "bytes:<n> (" match is stored as RX, the second as TX
# (presumably ifconfig's "RX bytes:... TX bytes:..." line -- confirm).
for match in re.finditer("(bytes:)(.*?)( \()", item):
if tempcount == 1:
RXold[key] = match.group(2)
tempcount = tempcount + 1
elif tempcount == 2:
TXold[key] = match.group(2)
netcard[key] = netcard[key] + 'net io percent(bytes/s): 0 <br>'
# Save the current interface information to the temporary file.
os.system('ifconfig > ifconfigtemp')
file = open('./ifconfigtemp','r');
fileold = open('./oldifconfig', 'w')
temptimestr = str(int(time.time()));
fileold.write(temptimestr)
fileold.write('#')
file.seek(0)
fileold.write(file.read())
fileold.close()
# returnkeys, returnvalues and tempcountcard are assigned but never read.
returnkeys = []
returnvalues = []
netcard = {}
tempcountcard = 0
file.seek(0)
key = ''
# First pass over the fresh output: collect the raw text per interface.
for strline in file.readlines():
reobj = re.compile('^lo*.')
if reobj.search(strline):
break;
reobj = re.compile('^eth*.')
if reobj.search(strline):
key = strline.split()[0]
netcard[key] = ''
netcard[key] = netcard[key] + strline
newnetcard = {}
file.seek(0)
key = ''
# Second pass: build the summary string per interface.
for strline in file.readlines():
reobj = re.compile('^lo*.')
if reobj.search(strline):
break;
if re.search("^eth", strline):
templist = strline.split()
key = templist[0]
newnetcard[key] = ''
# Fifth whitespace-separated field of the header line
# (presumably the hardware address -- TODO confirm).
newnetcard[key] = templist[4] + newnetcard[key] + ' '
if re.search("^ *inet ", strline):
templist = strline.split()
# Drops the first five characters of the second field
# (presumably an "addr:" prefix -- TODO confirm).
newnetcard[key] = templist[1][5:] + ' ' + newnetcard[key] + ' '
for key,value in newnetcard.items():
# Write the link state of each interface to a temporary file.
os.system('ethtool %s > ethtooltemp'%(key))
file = open('./ethtooltemp','r');
tempethtooltemplist = file.read().split('\n\t')
# NOTE(review): `close` is referenced but not called.
file.close
# "yes" in the last chunk is treated as link up
# (presumably ethtool's "Link detected" line -- TODO confirm).
if re.search("yes", tempethtooltemplist[-1]):
templist = newnetcard[key].split()
newnetcard[key] = templist[0] + ' runing! ' + templist[1]
else:
templist = newnetcard[key].split()
if len(templist) > 1:
newnetcard[key] = templist[0] + ' stop! ' + templist[1]
else:
newnetcard[key] = 'stop! ' + templist[0]
file.close()
RX = {}
TX = {}
for key,value in netcard.items():
tempsplit = value.split('\n')
netcard[key] = ''
for item in tempsplit:
item = item + '<br>'
netcard[key] = netcard[key] + item
tempcount = 1
for match in re.finditer("(bytes:)(.*?)( \()", item):
# Byte counter deltas since the previous snapshot.
if tempcount == 1:
RX[key] = str(int(match.group(2)) - int(RXold[key]))
tempcount = tempcount + 1
elif tempcount == 2:
TX[key] = str(int(match.group(2)) - int(TXold[key]))
# Seconds elapsed since the previous snapshot was written.
divtime = float(int(time.time()) - int(oldtime))
if divtime == 0:
rate = (float(TX[key]) + float(RX[key]))
else:
rate = (float(TX[key]) + float(RX[key]))/(divtime)
# The rate (sent + received bytes per second) is prepended to the summary.
if rate == 0:
newnetcard[key] = '0' + ' ' + newnetcard[key]
else:
newnetcard[key] = '%.2f'%rate + ' ' + newnetcard[key]
return zip(['order'], ['48']) + newnetcard.items();
else:
# First run: no previous snapshot exists, so only record one and report a rate of 0.
os.system('ifconfig > ifconfigtemp')
file = open('./ifconfigtemp','r');
fileold = open('./oldifconfig', 'w')
temptimestr = str(int(time.time()));
fileold.write(temptimestr)
fileold.write('#')
file.seek(0)
fileold.write(file.read())
fileold.close()
netcard = {}
file.seek(0)
key = ''
for strline in file.readlines():
reobj = re.compile('^lo*.')
if reobj.search(strline):
break;
reobj = re.compile('^eth*.')
if reobj.search(strline):
key = strline.split()[0]
netcard[key] = ''
netcard[key] = netcard[key] + strline
RX = {}
TX = {}
key = ''
newnetcard = {}
file.seek(0)
for strline in file.readlines():
reobj = re.compile('^lo*.')
if reobj.search(strline):
break;
if re.search("^eth", strline):
templist = strline.split()
key = templist[0]
newnetcard[key] = templist[4] + ' '
if re.search("^ *inet ", strline):
templist = strline.split()
newnetcard[key] = newnetcard[key] + templist[1][5:] + ' '
for key,value in newnetcard.items():
os.system('ethtool %s > ethtooltemp'%(key))
file = open('./ethtooltemp','r');
# NOTE(review): this branch splits on '\n' while the branch above splits on
# '\n\t'; if the file ends with a newline the last element is an empty string.
tempethtooltemplist = file.read().split('\n')
# NOTE(review): `close` is referenced but not called.
file.close
if re.search("yes", tempethtooltemplist[-1]):
newnetcard[key] = newnetcard[key] + 'runing!'
else:
newnetcard[key] = newnetcard[key] + 'stop!'
file.close()
for key,value in netcard.items():
tempsplit = value.split('\n')
netcard[key] = ''
for item in tempsplit:
item = item + '<br>'
#print item
netcard[key] = netcard[key] + item
tempcount = 1
for match in re.finditer("(bytes:)(.*?)( \()", item):
if tempcount == 1:
RX[key] = match.group(2)
tempcount = tempcount + 1
elif tempcount == 2:
TX[key] = match.group(2)
netcard[key] = netcard[key] + 'net io percent(bytes/s): 0 <br>'
newnetcard[key] = newnetcard[key] + ' ' + '0 <br>'
return zip(['order'], ['48']) + newnetcard.items();
# Script entry point: print the list returned by run().
if __name__ == '__main__':
print run()

#coding=utf-8
#!/usr/bin/python
import re
import os
import time
import utils
def sortedDictValues3(adict):
    """Return the values of ``adict`` as a list, ordered by ascending key."""
    # sorted() works on any iterable of keys, so this does not depend on
    # dict.keys() returning a sortable list.
    return [adict[key] for key in sorted(adict)]
# NOTE(review): the indentation of this listing has been lost; the nesting
# mentioned in the comments below is inferred from statement order and must
# be restored before the function can run.
#
# run() collects per-interface data from `ifconfig` and `ethtool` and returns
# a list of tuples: ('order', '48') followed by the items of newnetcard
# (interface name -> summary string). When utils.isLinux() is False it returns
# a single error tuple instead. State between calls is kept in ./oldifconfig
# as "<unix timestamp>#<ifconfig output>"; ifconfigtemp and ethtooltemp are
# scratch files written to the current directory.
def run():
if utils.isLinux() == False:
return [('ifconfig_collect os type error','this is windows')]
#not first run
if os.path.isfile('./oldifconfig'):
fileold = open('./oldifconfig', 'r')
fileold.seek(0)
# Read the traffic snapshot saved by the previous call, and its timestamp.
# NOTE(review): the two-value unpacking fails if the file contains more than one '#'.
(oldtime, fileoldcontent) = fileold.read().split('#')
# NOTE(review): `close` is referenced but not called, so the file is not closed here.
fileold.close;
netcard = {}
tempstr = ''
key = ''
for strline in fileoldcontent.split('\n'):
# NOTE(review): '^lo*.' matches any line starting with 'l' plus one more
# character, not only "lo"; presumably meant to stop at the loopback section.
reobj = re.compile('^lo*.')
if reobj.search(strline):
break;
# NOTE(review): '^eth*.' matches "et", any number of 'h', then any character.
reobj = re.compile('^eth*.')
if reobj.search(strline):
key = strline.split()[0]
# tempstr is never reset, so it keeps accumulating across interfaces.
tempstr = tempstr + strline + '\n'
netcard[key] = tempstr
RXold = {}
TXold = {}
for key,value in netcard.items():
tempsplit = value.split('\n')
netcard[key] = ''
for item in tempsplit:
item = item + '<br>'
netcard[key] = netcard[key] + item
tempcount = 1
# The first "bytes:<n> (" match is stored as RX, the second as TX
# (presumably ifconfig's "RX bytes:... TX bytes:..." line -- confirm).
for match in re.finditer("(bytes:)(.*?)( \()", item):
if tempcount == 1:
RXold[key] = match.group(2)
tempcount = tempcount + 1
elif tempcount == 2:
TXold[key] = match.group(2)
netcard[key] = netcard[key] + 'net io percent(bytes/s): 0 <br>'
# Save the current interface information to the temporary file.
os.system('ifconfig > ifconfigtemp')
file = open('./ifconfigtemp','r');
fileold = open('./oldifconfig', 'w')
temptimestr = str(int(time.time()));
fileold.write(temptimestr)
fileold.write('#')
file.seek(0)
fileold.write(file.read())
fileold.close()
# returnkeys, returnvalues and tempcountcard are assigned but never read.
returnkeys = []
returnvalues = []
netcard = {}
tempcountcard = 0
file.seek(0)
key = ''
# First pass over the fresh output: collect the raw text per interface.
for strline in file.readlines():
reobj = re.compile('^lo*.')
if reobj.search(strline):
break;
reobj = re.compile('^eth*.')
if reobj.search(strline):
key = strline.split()[0]
netcard[key] = ''
netcard[key] = netcard[key] + strline
newnetcard = {}
file.seek(0)
key = ''
# Second pass: build the summary string per interface.
for strline in file.readlines():
reobj = re.compile('^lo*.')
if reobj.search(strline):
break;
if re.search("^eth", strline):
templist = strline.split()
key = templist[0]
newnetcard[key] = ''
# Fifth whitespace-separated field of the header line
# (presumably the hardware address -- TODO confirm).
newnetcard[key] = templist[4] + newnetcard[key] + ' '
if re.search("^ *inet ", strline):
templist = strline.split()
# Drops the first five characters of the second field
# (presumably an "addr:" prefix -- TODO confirm).
newnetcard[key] = templist[1][5:] + ' ' + newnetcard[key] + ' '
for key,value in newnetcard.items():
# Write the link state of each interface to a temporary file.
os.system('ethtool %s > ethtooltemp'%(key))
file = open('./ethtooltemp','r');
tempethtooltemplist = file.read().split('\n\t')
# NOTE(review): `close` is referenced but not called.
file.close
# "yes" in the last chunk is treated as link up
# (presumably ethtool's "Link detected" line -- TODO confirm).
if re.search("yes", tempethtooltemplist[-1]):
templist = newnetcard[key].split()
newnetcard[key] = templist[0] + ' runing! ' + templist[1]
else:
templist = newnetcard[key].split()
if len(templist) > 1:
newnetcard[key] = templist[0] + ' stop! ' + templist[1]
else:
newnetcard[key] =  'stop! ' + templist[0]
file.close()
RX = {}
TX = {}
for key,value in netcard.items():
tempsplit = value.split('\n')
netcard[key] = ''
for item in tempsplit:
item = item + '<br>'
netcard[key] = netcard[key] + item
tempcount = 1
for match in re.finditer("(bytes:)(.*?)( \()", item):
# Byte counter deltas since the previous snapshot.
if tempcount == 1:
RX[key] = str(int(match.group(2)) - int(RXold[key]))
tempcount = tempcount + 1
elif tempcount == 2:
TX[key] = str(int(match.group(2)) - int(TXold[key]))
# Seconds elapsed since the previous snapshot was written.
divtime = float(int(time.time()) - int(oldtime))
if divtime == 0:
rate = (float(TX[key]) + float(RX[key]))
else:
rate = (float(TX[key]) + float(RX[key]))/(divtime)
# The rate (sent + received bytes per second) is prepended to the summary.
if rate == 0:
newnetcard[key] = '0' + ' ' + newnetcard[key]
else:
newnetcard[key] = '%.2f'%rate + ' ' + newnetcard[key]
return zip(['order'], ['48']) + newnetcard.items();
else:
# First run: no previous snapshot exists, so only record one and report a rate of 0.
os.system('ifconfig > ifconfigtemp')
file = open('./ifconfigtemp','r');
fileold = open('./oldifconfig', 'w')
temptimestr = str(int(time.time()));
fileold.write(temptimestr)
fileold.write('#')
file.seek(0)
fileold.write(file.read())
fileold.close()
netcard = {}
file.seek(0)
key = ''
for strline in file.readlines():
reobj = re.compile('^lo*.')
if reobj.search(strline):
break;
reobj = re.compile('^eth*.')
if reobj.search(strline):
key = strline.split()[0]
netcard[key] = ''
netcard[key] = netcard[key] + strline
RX = {}
TX = {}
key = ''
newnetcard = {}
file.seek(0)
for strline in file.readlines():
reobj = re.compile('^lo*.')
if reobj.search(strline):
break;
if re.search("^eth", strline):
templist = strline.split()
key = templist[0]
newnetcard[key] = templist[4] + ' '
if re.search("^ *inet ", strline):
templist = strline.split()
newnetcard[key] = newnetcard[key] + templist[1][5:] + ' '
for key,value in newnetcard.items():
os.system('ethtool %s > ethtooltemp'%(key))
file = open('./ethtooltemp','r');
# NOTE(review): this branch splits on '\n' while the branch above splits on
# '\n\t'; if the file ends with a newline the last element is an empty string.
tempethtooltemplist = file.read().split('\n')
# NOTE(review): `close` is referenced but not called.
file.close
if re.search("yes", tempethtooltemplist[-1]):
newnetcard[key] = newnetcard[key] + 'runing!'
else:
newnetcard[key] = newnetcard[key] + 'stop!'
file.close()
for key,value in netcard.items():
tempsplit = value.split('\n')
netcard[key] = ''
for item in tempsplit:
item = item + '<br>'
#print item
netcard[key] = netcard[key] + item
tempcount = 1
for match in re.finditer("(bytes:)(.*?)( \()", item):
if tempcount == 1:
RX[key] = match.group(2)
tempcount = tempcount + 1
elif tempcount == 2:
TX[key] = match.group(2)
netcard[key] = netcard[key] + 'net io percent(bytes/s): 0 <br>'
newnetcard[key] = newnetcard[key] + ' ' + '0 <br>'
return zip(['order'], ['48']) + newnetcard.items();
# Script entry point: print the list returned by run().
if __name__ == '__main__':
print run()

使用例子：

每一个列表元素元组里面第二个元素第一个字段为网速 Bytes/S，例如eth1网卡的网速就是3.3KB/s，eth0网速是2.9KB/s，今天是周六这个流量很正常

posted @ 2010-03-15 19:22 学者站在巨人的肩膀上阅读(621) | 评论 (0) | 编辑收藏

自顶向下学搜索引擎——北大天网搜索引擎TSE分析及完全注释[6]倒排索引的建立的程序分析(4)

以下是根据正向索引建立倒排索引的注释

// Build the inverted index from a sorted forward index.
// Usage: ./CrtInvertedIdx moon.fidx.sort > sun.iidx
//
// Every input line has the form "<term>\t<docId>". Consecutive lines that
// share a term are merged into one output line: the term, a tab, then the
// document ids, each preceded by a single space.
// Returns 0 on success and -1 when the input file is missing or unreadable.
int main(int argc, char* argv[])
{
    if (argc < 2)
    {
        cerr << "Usage: CrtInvertedIdx <sorted forward index>\n";
        return -1;
    }

    ifstream ifsImgInfo(argv[1]);
    if (!ifsImgInfo)
    {
        cerr << "Cannot open " << argv[1] << " for input\n";
        return -1;
    }

    string strLine, strDocNum, curTerm;
    bool haveTerm = false;  // becomes true once the first accepted term is seen
    while (getline(ifsImgInfo, strLine))
    {
        string::size_type idx = strLine.find("\t");
        if (idx == string::npos) continue;  // malformed line: no term/docId separator

        string term = strLine.substr(0, idx);

        // Terms shorter than 2 or longer than 8 bytes are not indexed.
        if (term.size() < 2 || term.size() > 8) continue;

        if (!haveTerm)
        {
            curTerm = term;
            haveTerm = true;
        }
        else if (term != curTerm)
        {
            // The term changed: flush the posting list collected so far.
            cout << curTerm << "\t" << strDocNum << endl;
            curTerm = term;
            strDocNum.clear();
        }

        strDocNum += " " + strLine.substr(idx + 1);
        //if (cnt==100) break;
    }

    // Flush the last term; nothing is written when no line was accepted.
    if (haveTerm)
        cout << curTerm << "\t" << strDocNum << endl;

    return 0;
}

posted @ 2009-12-10 23:03 学者站在巨人的肩膀上阅读(1666) | 评论 (3) | 编辑收藏

自顶向下学搜索引擎——北大天网搜索引擎TSE分析及完全注释[6]倒排索引的建立的程序分析(3)

这里介绍正向索引的建立，如果直接建立倒排索引效率上可能会很低，所以可以先产生正向索引为后面的倒排索引打下基础。

详细的文件功能和介绍都在这里有了介绍自顶向下学搜索引擎——北大天网搜索引擎TSE分析及完全注释[5]倒排索引的建立及文件介绍

CrtForwardIdx.cpp文件

// Build the forward index from a segmented file.
// Usage: ./CrtForwardIdx Tianwang.raw.***.seg > moon.fidx
//
// The input alternates between a line holding a document number (odd lines)
// and a line holding that document's terms separated by SEPARATOR (even
// lines). One "<term>\t<docNum>" line is written per term. Text after the
// last SEPARATOR on a line is not emitted.
// Returns 0 on success and -1 when the input file is missing or unreadable.
int main(int argc, char* argv[])
{
    if (argc < 2)
    {
        cerr << "Usage: CrtForwardIdx <segmented file>\n";
        return -1;
    }

    ifstream ifsImgInfo(argv[1]);
    if (!ifsImgInfo)
    {
        cerr << "Cannot open " << argv[1] << " for input\n";
        return -1;
    }

    string strLine, strDocNum;
    int cnt = 0;
    while (getline(ifsImgInfo, strLine))
    {
        cnt++;
        if (cnt%2 == 1) // odd lines hold the document number
        {
            strDocNum = strLine;
            continue;
        }
        if (strLine.empty() || strLine[0]=='\0' || strLine[0]=='#' || strLine[0]=='\n')
        {
            continue;
        }

        // Walk the line from separator to separator instead of re-slicing
        // the remaining text after every term.
        string::size_type start = 0, idx;
        while ( (idx = strLine.find(SEPARATOR, start)) != string::npos )
        {
            cout << strLine.substr(start, idx - start) << "\t" << strDocNum << endl;
            start = idx + SEPARATOR.size();
        }

        //if (cnt==100) break;
    }

    return 0;
}

author:http://hi.baidu.com/jrckkyy

author:http://blog.csdn.net/jrckkyy

posted @ 2009-12-10 23:02 学者站在巨人的肩膀上阅读(1254) | 评论 (1) | 编辑收藏

自顶向下学搜索引擎——北大天网搜索引擎TSE分析及完全注释[6]倒排索引的建立的程序分析(2)

前面的DocIndex程序输入一个Tianwang.raw.*****文件，会产生以下三个文件 Doc.idx, Url.idx, DocId2Url.idx，我们这里对DocSegment程序进行分析。

这里输入 Tianwang.raw.*****，Doc.idx，Url.idx.sort_uniq等三个文件，输出一个Tianwang.raw.***.seg 分词完毕的文件

int main(int argc, char* argv[])
{
    string strLine, strFileName=argv[1];
    CUrl iUrl;
    vector<CUrl> vecCUrl;
    CDocument iDocument;
    vector<CDocument> vecCDocument;
    unsigned int docId = 0;

    //ifstream ifs("Tianwang.raw.2559638448");
    ifstream ifs(strFileName.c_str()); //DocSegment Tianwang.raw.****
    if (!ifs)
    {
        cerr << "Cannot open tianwang.img.info for input\n";
        return -1;
    }

    ifstream ifsUrl("Url.idx.sort_uniq");   //排序并消重后的url字典
    if (!ifsUrl)
    {
        cerr << "Cannot open Url.idx.sort_uniq for input\n";
        return -1;
    }
    ifstream ifsDoc("Doc.idx"); //字典文件
    if (!ifsDoc)
    {
        cerr << "Cannot open Doc.idx for input\n";
        return -1;
    }

    while (getline(ifsUrl,strLine)) //偏离url字典存入一个向量内存中
    {
        char chksum[33];
        int docid;

        memset(chksum, 0, 33);
        sscanf( strLine.c_str(), "%s%d", chksum, &docid );
        iUrl.m_sChecksum = chksum;
        iUrl.m_nDocId = docid;
        vecCUrl.push_back(iUrl);
    }

    while (getline(ifsDoc,strLine))     //偏离字典文件将其放入一个向量内存中
    {
        int docid,pos,length;
        char chksum[33];

        memset(chksum, 0, 33);
        sscanf( strLine.c_str(), "%d%d%d%s", &docid, &pos, &length,chksum );
        iDocument.m_nDocId = docid;
        iDocument.m_nPos = pos;
        iDocument.m_nLength = length;
        iDocument.m_sChecksum = chksum;
        vecCDocument.push_back(iDocument);
    }

    strFileName += ".seg";
    ofstream fout(strFileName.c_str(), ios::in|ios::out|ios::trunc|ios::binary);    //设置完成分词后的数据输出文件
    for ( docId=0; docId<MAX_DOC_ID; docId++ )
    {

        // find document according to docId
        int length = vecCDocument[docId+1].m_nPos - vecCDocument[docId].m_nPos -1;
        char *pContent = new char[length+1];
        memset(pContent, 0, length+1);
        ifs.seekg(vecCDocument[docId].m_nPos);
        ifs.read(pContent, length);

char *s;
s = pContent;

        // skip Head
        int bytesRead = 0,newlines = 0;
        while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1)
        {
            if (*s == '\n')
                newlines++;
            else
                newlines = 0;
            s++;
            bytesRead++;
        }
        if (bytesRead == HEADER_BUF_SIZE-1) continue;

        // skip header
        bytesRead = 0,newlines = 0;
        while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1)
        {
            if (*s == '\n')
                newlines++;
            else
                newlines = 0;
            s++;
            bytesRead++;
        }
        if (bytesRead == HEADER_BUF_SIZE-1) continue;

        //iDocument.m_sBody = s;
        iDocument.RemoveTags(s);    //去除<>
        iDocument.m_sBodyNoTags = s;

delete[] pContent;
string strLine = iDocument.m_sBodyNoTags;

CStrFun::ReplaceStr(strLine, " ", " ");
CStrFun::EmptyStr(strLine); // set " \t\r\n" to " "

        // segment the document 具体分词处理
        CHzSeg iHzSeg;
        strLine = iHzSeg.SegmentSentenceMM(iDict,strLine);
        fout << docId << endl << strLine;
        fout << endl;

    }

return(0);
}
这里只是浮光掠影式的过一遍大概的代码，后面我会有专题详细讲解 parse html 和 segment document 等技术

posted @ 2009-12-10 23:02 学者站在巨人的肩膀上阅读(1209) | 评论 (1) | 编辑收藏

自顶向下学搜索引擎——北大天网搜索引擎TSE分析及完全注释[6]倒排索引的建立的程序分析(1)

author:http://hi.baidu.com/jrckkyy

author:http://blog.csdn.net/jrckkyy

上一篇主要介绍了倒排索引建立相关的文件及中间文件。
TSE建立索引在运行程序上的大致步骤可以简化分为以下几步：

1、运行命令#./DocIndex
会用到一个文件 tianwang.raw.520 //爬取回来的原始文件，包含多个网页的所有信息，所以很大，这也是一个有待解决的问题，到底存成大文件（如果过大会超过2G或4G的限制，而且文件过大索引效率过低）还是小文件（文件数过多用于打开关闭文件句柄的消耗过大）还有待思考，还就是存储方案的解决最终肯定是要存为分布式的，最终总文件量肯定是会上TB的，TSE只支持小型的搜索引擎需求。
会产生以下三个文件 Doc.idx, Url.idx, DocId2Url.idx //Data文件夹中的Doc.idx、Url.idx和DocId2Url.idx

2、运行命令#sort Url.idx|uniq > Url.idx.sort_uniq //Data文件夹中的Url.idx.sort_uniq
会用到一个文件 Url.idx文件 //md5 hash 之后的url完整地址和document id值对
会产生一个文件 Url.idx.sort_uniq //URL消重，md5 hash排序，提高检索效率

3、运行命令#./DocSegment Tianwang.raw.2559638448
会用到一个文件 Tianwang.raw.2559638448 //Tianwang.raw.2559638448为爬回来的文件，每个页面包含http头，分词为后面建立倒排索引做准备
会产生一个文件 Tianwang.raw.2559638448.seg //分词文件，由一行document id号和一行文档分词组（只对每个文档<html></html>中<head></head><body></body>等文字标记中的文本进行分组）构成

4、运行命令#./CrtForwardIdx Tianwang.raw.2559638448.seg > moon.fidx //建立独立的正向索引

5、运行命令
#set | grep "LANG"
#LANG=en; export LANG;
#sort moon.fidx > moon.fidx.sort

6、运行命令#./CrtInvertedIdx moon.fidx.sort > sun.iidx //建立倒排索引

我们先从建立索引的第一个程序DocIndex.cpp开始分析。(注释约定：Tianwang.raw.2559638448是抓回来合并成的大文件，后面就叫大文件，里面包含了很多篇html文档，里面的文档有规律的分隔就叫做一篇一篇的文档)

//DocIndex.h start-------------------------------------------------------------

// Shared constants (buffer sizes, document limits, data file paths).
// NOTE(review): the header names after the #include directives below are
// missing from this listing and must be restored before it compiles.
#ifndef _COMM_H_040708_
#define _COMM_H_040708_

#include

#include
#include
#include
#include
#include
#include
#include

using namespace std;

const unsigned HEADER_BUF_SIZE = 1024;
const unsigned RstPerPage = 20; // number of search results returned per page by the front end

//iceway
//const unsigned MAX_DOC_IDX_ID = 21312; // used in DocSegment.cpp
const unsigned MAX_DOC_IDX_ID = 22104;

//const string IMG_INFO_NAME("./Data/s1.1");
const string INF_INFO_NAME("./Data/sun.iidx"); // inverted index file
// sample lines of that file (term followed by document ids):
//朱德 14383 16151 16151 16151 1683 207 6302 7889 8218 8218 8637
//朱古力 1085 1222

// a word file of 90,000+ entries, including special symbols, punctuation and Chinese characters
const string DOC_IDX_NAME("./Data/Doc.idx"); // document index file (the original comment called it the inverted index file)
const string RAWPAGE_FILE_NAME("./Data/Tianwang.swu.iceway.1.0");

//iceway
const string DOC_FILE_NAME = "Tianwang.swu.iceway.1.0"; // used in Docindex.cpp
const string Data_DOC_FILE_NAME = "./Data/Tianwang.swu.iceway.1.0"; // used in Snapshot.cpp

//const string RM_THUMBNAIL_FILES("rm -f ~/public_html/ImgSE/timg/*");

//const string THUMBNAIL_DIR("/ImgSE/timg/");

// NOTE(review): the identifier after #endif is an extra token; compilers
// usually warn unless it is written as a comment.
#endif _COMM_H_040708_
//DocIndex.h end--------------------------------------------------------------//DocIndex.cpp start-----------------------------------------------------------

#include
#include
#include "Md5.h"
#include "Url.h"
#include "Document.h"

//iceway(mnsc)
#include "Comm.h"
#include

using namespace std;

// Build Doc.idx, Url.idx and DocId2Url.idx from the raw page file DOC_FILE_NAME.
//
// The raw file is a sequence of records. Each record starts with a
// "version: 1.0" line followed by a "url: " line, later a "length: " line
// and one separator line, and then `length` bytes of page data.
// Per record this writes:
//   Url.idx        <md5 of url>\t<docId>
//   Doc.idx        <docId>\t<byte offset of the record>\t<md5 of page data>
//   DocId2Url.idx  <docId>\t<url>
// Doc.idx ends with one extra line: <document count>\t<offset after the last record>.
// Returns 0 on success and -1 when a file cannot be opened.
int main(int argc, char* argv[])
{
    //ifstream ifs("Tianwang.raw.2559638448");
    //ifstream ifs("Tianwang.raw.3023555472");
    //iceway(mnsc)
    ifstream ifs(DOC_FILE_NAME.c_str()); // the raw page file
    if (!ifs)
    {
        cerr << "Cannot open " << DOC_FILE_NAME << " for input\n";
        return -1;
    }
    ofstream ofsUrl("Url.idx", ios::in|ios::out|ios::trunc|ios::binary); // create and open Url.idx
    if( !ofsUrl )
    {
        cerr << "error open file Url.idx" << endl;
        return -1;
    }

    ofstream ofsDoc("Doc.idx", ios::in|ios::out|ios::trunc|ios::binary); // create and open Doc.idx
    if( !ofsDoc )
    {
        cerr << "error open file Doc.idx" << endl;
        return -1;
    }

    ofstream ofsDocId2Url("DocId2Url.idx", ios::in|ios::out|ios::trunc|ios::binary); // create and open DocId2Url.idx
    if( !ofsDocId2Url )
    {
        cerr << "error open file DocId2Url.idx" << endl;
        return -1;
    }

    int cnt=0; // document ids start at 0
    string strLine;
    CUrl iUrl;
    CDocument iDocument;
    CMD5 iMD5;

    int nOffset = ifs.tellg();
    while (getline(ifs, strLine))
    {
        if (strLine.empty() || strLine[0]=='\0' || strLine[0]=='#' || strLine[0]=='\n')
        {
            nOffset = ifs.tellg();
            continue;
        }

        if (!strncmp(strLine.c_str(), "version: 1.0", 12)) // a record starts with "version: 1.0"
        {
            if(!getline(ifs, strLine)) break;
            // Compare all five characters: substr(5) below needs at least that many.
            if (!strncmp(strLine.c_str(), "url: ", 5))
            {
                iUrl.m_sUrl = strLine.substr(5); // the url follows the five-character "url: " prefix
                iMD5.GenerateMD5( (unsigned char*)iUrl.m_sUrl.c_str(), iUrl.m_sUrl.size() ); // md5 of the url
                iUrl.m_sChecksum = iMD5.ToString(); // checksum as a string

            } else
            {
                continue;
            }

            // Read on until the "length: " line and take the page length from it.
            bool haveLength = false;
            while (getline(ifs, strLine))
            {
                if (!strncmp(strLine.c_str(), "length: ", 8))
                {
                    haveLength = sscanf(strLine.substr(8).c_str(), "%d", &(iDocument.m_nLength)) == 1
                                 && iDocument.m_nLength >= 0;
                    break;
                }
            }
            if (!haveLength)
            {
                if (!ifs) break;        // end of file reached while looking for "length: "
                nOffset = ifs.tellg();  // unusable length: resume at the next record
                continue;
            }

            getline(ifs, strLine); // skip the line that follows "length: "

            iDocument.m_nDocId = cnt; // document id
            iDocument.m_nPos = nOffset; // byte offset at which this record starts
            char *pContent = new char[iDocument.m_nLength+1]; // buffer for the page data

            memset(pContent, 0, iDocument.m_nLength+1); // zero-fill, including the terminator
            ifs.read(pContent, iDocument.m_nLength); // read the page data
            iMD5.GenerateMD5( (unsigned char*)pContent, iDocument.m_nLength );
            iDocument.m_sChecksum = iMD5.ToString(); // md5 of the page data

            delete[] pContent;

            ofsUrl << iUrl.m_sChecksum ; // md5 of the url
            ofsUrl << "\t" << iDocument.m_nDocId << endl; // tab, then the document id

            ofsDoc << iDocument.m_nDocId ; // document id
            ofsDoc << "\t" << iDocument.m_nPos ; // tab, then the record's byte offset
            //ofsDoc << "\t" << iDocument.m_nLength ;
            ofsDoc << "\t" << iDocument.m_sChecksum << endl; // tab, then the md5 of the page data

            ofsDocId2Url << iDocument.m_nDocId ; // document id
            ofsDocId2Url << "\t" << iUrl.m_sUrl << endl; // tab, then the full url

            cnt++; // this document is done; move on to the next id
        }

        nOffset = ifs.tellg();

    }

    // The final line holds the document count and the offset after the last record.
    ofsDoc << cnt ;
    ofsDoc << "\t" << nOffset << endl;

    return(0);
}

//DocIndex.cpp end-----------------------------------------------------------author:http://hi.baidu.com/jrckkyy

author:http://blog.csdn.net/jrckkyy

posted @ 2009-12-10 23:00 学者站在巨人的肩膀上阅读(1395) | 评论 (1) | 编辑收藏

自顶向下学搜索引擎——北大天网搜索引擎TSE分析及完全注释[5]倒排索引的建立及文件介绍

不好意思让大家久等了，前一阵一直在忙考试，终于结束了。呵呵！废话不多说了下面我们开始吧！

TSE用的是将抓取回来的网页文档全部装入一个大文档，然后对这一个大文档内的数据整体统一的建索引，其中包含了几个步骤。

view plaincopy to clipboardprint?
1. The document index (Doc.idx) keeps information about each document.

It is a fixed width ISAM (Index sequential access mode) index, ordered by docID.

The information stored in each entry includes a pointer into the repository,

a document length, a document checksum.

//Doc.idx 文档编号    文档在原始文件中的偏移量    checksum hash码

0   0   bc9ce846d7987c4534f53d423380ba70

1   76760   4f47a3cad91f7d35f4bb6b2a638420e5

2   141624 d019433008538f65329ae8e39b86026c

3   142350 5705b8f58110f9ad61b1321c52605795

//Doc.idx   end

The url index (url.idx) is used to convert URLs into docIDs.

//url.idx

5c36868a9c5117eadbda747cbdb0725f    0

3272e136dd90263ee306a835c6c70d77    1

6b8601bb3bb9ab80f868d549b5c5a5f3    2

3f9eba99fa788954b5ff7f35a5db6e1f    3

//url.idx   end

It is a list of URL checksums with their corresponding docIDs and is sorted by

checksum. In order to find the docID of a particular URL, the URL's checksum

is computed and a binary search is performed on the checksums file to find its

docID.

    ./DocIndex

        got Doc.idx, Url.idx, DocId2Url.idx //Data文件夹中的Doc.idx、Url.idx和DocId2Url.idx

//DocId2Url.idx

0   http://*.*.edu.cn/index.aspx

1   http://*.*.edu.cn/showcontent1.jsp?NewsID=118

2   http://*.*.edu.cn/0102.html

3   http://*.*.edu.cn/0103.html

//DocId2Url.idx end

2. sort Url.idx|uniq > Url.idx.sort_uniq    //Data文件夹中的Url.idx.sort_uniq

//Url.idx.sort_uniq

//对hash值进行排序

000bfdfd8b2dedd926b58ba00d40986b    1111

000c7e34b653b5135a2361c6818e48dc    1831

0019d12f438eec910a06a606f570fde8    366

0033f7c005ec776f67f496cd8bc4ae0d    2103

3. Segment document to terms, (with finding document according to the url)

    ./DocSegment Tianwang.raw.2559638448        //Tianwang.raw.2559638448为爬回来的文件，每个页面包含http头

        got Tianwang.raw.2559638448.seg

//Tianwang.raw.2559638448   爬取的原始网页文件在文档内部每一个文档之间应该是通过version，</html>和回车做标志位分割的

version: 1.0

url: http://***.105.138.175/Default2.asp?lang=gb

origin: http://***.105.138.175/

date: Fri, 23 May 2008 20:01:36 GMT

ip: 162.105.138.175

length: 38413

HTTP/1.1 200 OK

Server: Microsoft-IIS/5.0

Date: Fri, 23 May 2008 11:17:49 GMT

Connection: keep-alive

Connection: Keep-Alive

Content-Length: 38088

Content-Type: text/html; Charset=gb2312

Expires: Fri, 23 May 2008 11:17:49 GMT

Set-Cookie: ASPSESSIONIDSSTRDCAB=IMEOMBIAIPDFCKPAEDJFHOIH; path=/

Cache-control: private

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"

"http://www.w3.org/TR/html4/loose.dtd">

<html>

<head>

<title>Apabi数字资源平台</title>

<meta http-equiv="Content-Type" content="text/html; charset=gb2312">

<META NAME="ROBOTS" CONTENT="INDEX,NOFOLLOW">

<META NAME="DESCRIPTION" CONTENT="数字图书馆方正数字图书馆电子图书电子书 ebook e书 Apabi 数字资源平台">

<link rel="stylesheet" type="text/css" href="css\common.css">

<style type="text/css">



</style>

<script LANGUAGE="vbscript">

...

</script>

<Script Language="javascript">

...

</Script>

</head>

<body leftmargin="0" topmargin="0">

</body>

</html>

//Tianwang.raw.2559638448   end

//Tianwang.raw.2559638448.seg   将每个页面分成一行如下(注意中间没有回车作为分隔)

1

...

...

...

2

...

...

...

//Tianwang.raw.2559638448.seg   end

//以下是 Tiny search 的非必须步骤

4. Create forward index (docid-->termid)     //建立正向索引

    ./CrtForwardIdx Tianwang.raw.2559638448.seg > moon.fidx

//Tianwang.raw.2559638448.seg 将每个页面分成一行如下
//分词   DocID
1
三星/ s/ 手机/ 论坛/ ,/ 手机/ 铃声/ 下载/ ,/ 手机/ 图片/ 下载/ ,/ 手机/
2
...
...
...

1. The document index (Doc.idx) keeps information about each document.

It is a fixed width ISAM (Index sequential access mode) index, ordered by docID.

The information stored in each entry includes a pointer into the repository,

a document length, a document checksum.

//Doc.idx 文档编号文档长度 checksum hash码

0 0 bc9ce846d7987c4534f53d423380ba70

1 76760 4f47a3cad91f7d35f4bb6b2a638420e5

2 141624 d019433008538f65329ae8e39b86026c

3 142350 5705b8f58110f9ad61b1321c52605795

//Doc.idx end

The url index (url.idx) is used to convert URLs into docIDs.

//url.idx

5c36868a9c5117eadbda747cbdb0725f 0

3272e136dd90263ee306a835c6c70d77 1

6b8601bb3bb9ab80f868d549b5c5a5f3 2

3f9eba99fa788954b5ff7f35a5db6e1f 3

//url.idx end

It is a list of URL checksums with their corresponding docIDs and is sorted by

checksum. In order to find the docID of a particular URL, the URL's checksum

is computed and a binary search is performed on the checksums file to find its

docID.

./DocIndex

got Doc.idx, Url.idx, DocId2Url.idx //生成的文件位于Data文件夹中：Doc.idx、Url.idx和DocId2Url.idx

//DocId2Url.idx

0 http://*.*.edu.cn/index.aspx

1 http://*.*.edu.cn/showcontent1.jsp?NewsID=118

2 http://*.*.edu.cn/0102.html

3 http://*.*.edu.cn/0103.html

//DocId2Url.idx end

2. sort Url.idx|uniq > Url.idx.sort_uniq //Data文件夹中的Url.idx.sort_uniq

//Url.idx.sort_uniq

//对hash值进行排序

000bfdfd8b2dedd926b58ba00d40986b 1111

000c7e34b653b5135a2361c6818e48dc 1831

0019d12f438eec910a06a606f570fde8 366

0033f7c005ec776f67f496cd8bc4ae0d 2103

3. Segment document to terms, (with finding document according to the url)

./DocSegment Tianwang.raw.2559638448 //Tianwang.raw.2559638448为爬回来的文件，每个页面包含http头

got Tianwang.raw.2559638448.seg

//Tianwang.raw.2559638448 爬取的原始网页文件在文档内部每一个文档之间应该是通过version，</html>和回车做标志位分割的

version: 1.0

url: http://***.105.138.175/Default2.asp?lang=gb

origin: http://***.105.138.175/

date: Fri, 23 May 2008 20:01:36 GMT

ip: 162.105.138.175

length: 38413

HTTP/1.1 200 OK

Server: Microsoft-IIS/5.0

Date: Fri, 23 May 2008 11:17:49 GMT

Connection: keep-alive

Connection: Keep-Alive

Content-Length: 38088

Content-Type: text/html; Charset=gb2312

Expires: Fri, 23 May 2008 11:17:49 GMT

Set-Cookie: ASPSESSIONIDSSTRDCAB=IMEOMBIAIPDFCKPAEDJFHOIH; path=/

Cache-control: private

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"

"http://www.w3.org/TR/html4/loose.dtd">

<html>

<head>

<title>Apabi数字资源平台</title>

<!--

.style4 {color: #666666}

-->

</style>

...

</script>

...

</Script>

</head>

</body>

</html>

//Tianwang.raw.2559638448 end

//Tianwang.raw.2559638448.seg 将每个页面分成一行如下(注意中间没有回车作为分隔)

1

...

2

...

//Tianwang.raw.2559638448.seg end

//以下是 Tiny search 的非必须步骤

4. Create forward index (docid-->termid) //建立正向索引

./CrtForwardIdx Tianwang.raw.2559638448.seg > moon.fidx

//Tianwang.raw.2559638448.seg 将每个页面分成一行如下
//分词   DocID
1
三星/ s/ 手机/ 论坛/ ,/ 手机/ 铃声/ 下载/ ,/ 手机/ 图片/ 下载/ ,/ 手机/
2
...
...
...
view plaincopy to clipboardprint?
//Tianwang.raw.2559638448.seg end

//moon.fidx

//每篇文档号对应文档内分出来的    分词 DocID

都会 2391

使   2391

那些 2391

拥有 2391

它   2391

的   2391

人   2391

的   2391

视野 2391

变   2391

窄   2391

在   2180

研究生部    2180

主页 2180

培养 2180

管理 2180

栏目 2180

下载 2180

）   2180

、   2180

关于 2180

做好 2180

年   2180

国家 2180

公派 2180

研究生 2180

项目 2180

//moon.fidx end

5.# set | grep "LANG"

LANG=en; export LANG;

sort moon.fidx > moon.fidx.sort

6. Create inverted index (termid-->docid)    //建立倒排索引

    ./CrtInvertedIdx moon.fidx.sort > sun.iidx

//sun.iidx //文件规模大概减少1/2

花工   236

花海   2103

花卉   1018 1061 1061 1061 1730 1730 1730 1730 1730 1852 949 949

花蕾   447 447

花木   1061

花呢   1430

花期   447 447 447 447 447 525

花钱   174 236

花色   1730 1730

花色品种     1660

花生   450 526

花式   1428 1430 1430 1430

花纹   1430 1430

花序   447 447 447 447 447 450

花絮   136 137

花芽   450 450

//sun.iidx end

TSESearch   CGI program for query

Snapshot    CGI program for page snapshot

<P>
author:http://hi.baidu.com/jrckkyy

author:http://blog.csdn.net/jrckkyy
</P>

posted @ 2009-12-10 22:55 学者站在巨人的肩膀上阅读(1354) | 评论 (1) | 编辑收藏

自顶向下学搜索引擎——北大天网搜索引擎TSE分析及完全注释[4]小结

通过前面的三篇文章，相信你已经对神秘的搜索引擎有了一个感性的认识：它和普通的php之类的脚本语言服务器程序类似，先获取前台提交的关键字，经过字典分词，再和事先建立好的倒排索引进行相关性分析，得出查询结果并格式化输出。而这里的技术难点在于：

1、字典的选取（事实上根据不同时代不同地方人们的语言习惯是不一样的所以说字典的最小元的取值是不同的）

2、倒排索引的建立（这里就要涉及到爬虫的抓取和索引的建立后面将重点介绍这2点，搜索引擎的效率和服务质量实效性瓶颈在这里）

3、相关性分析（对抓回来的文档分词建索引和用户关键字分词算法上要对应）

后面文章会重点介绍爬虫的抓取和索引的建立。

posted @ 2009-12-10 22:54 学者站在巨人的肩膀上阅读(1041) | 评论 (0) | 编辑收藏

自顶向下学搜索引擎——北大天网搜索引擎TSE分析及完全注释[3]来到关键字分词及相关性分析程序

由前面的注释我们可以知道，查询关键字和字典文件准备好后，将进入用户关键字分词阶段。

//TSESearch.cpp中：

view plaincopy to clipboardprint?
// Excerpt from TSESearch.cpp: segment the query, split it into terms, then
// look the terms up in the inverted index.
CHzSeg iHzSeg;      // declared in ChSeg/HzSeg.h

// Segment the query string obtained from the GET request. The terms come back
// joined by "/" plus whitespace, in the form "我/ 爱/ 你们/ 的/".
iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery);

vector<string> vecTerm;
iQuery.ParseQuery(vecTerm);     // append each "/"-separated term, in order, to vecTerm

set<string> setRelevantRst;
iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);

gettimeofday(&end_tv,&tz);      // fills end_tv once the lookup has returned
// search end

// Excerpt from TSESearch.cpp: segment the query, split it into terms, then
// look the terms up in the inverted index.
CHzSeg iHzSeg; // declared in ChSeg/HzSeg.h

// Segment the query string obtained from the GET request. The terms come back
// joined by "/" plus whitespace, in the form "我/ 爱/ 你们/ 的/".
iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery);

vector<string> vecTerm;
iQuery.ParseQuery(vecTerm); // append each "/"-separated term, in order, to vecTerm

set<string> setRelevantRst;
iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);

gettimeofday(&end_tv,&tz); // fills end_tv once the lookup has returned
// search end
//搜索完毕view plaincopy to clipboardprint?
看CHzSeg 中的这个方法

看CHzSeg 中的这个方法view plaincopy to clipboardprint?
//ChSeg/HzSeg.h

//ChSeg/HzSeg.hview plaincopy to clipboardprint?
/**
 * Pre-process a sentence before segmentation.
 *
 * Consumes s1 from the front, one run at a time, and appends the processed
 * runs to the result. The first byte of what is left selects the branch:
 * values below 128 enter the ASCII branch; a run of Chinese characters is
 * handed to SegmentHzStrMM(); the remaining visible code copies a run to the
 * output followed by SEPARATOR unless it starts with the byte pair 161,161.
 *
 * NOTE(review): this listing is incomplete. Text between '<' and '>' was
 * swallowed as markup when the page was extracted (see the inline notes), so
 * the function does not compile as shown; restore it from the upstream
 * ChSeg/HzSeg.cpp before relying on it.
 *
 * @param dict dictionary, forwarded to SegmentHzStrMM()
 * @param s1   sentence to process (passed by value and consumed)
 * @return the accumulated output string s2
 */
string CHzSeg::SegmentSentenceMM (CDict &dict, string s1) const
{
    string s2="";
    unsigned int i,len;

    while (!s1.empty())
    {
        unsigned char ch=(unsigned char) s1[0];
        if(ch<128)
        { // deal with ASCII
            i=1;
            len = s1.size();
            // NOTE(review): the next line is extraction damage. Everything from
            // "i<len" up to ">=161)" was parsed as an HTML tag, which appears to
            // have removed the rest of the ASCII branch and the start of the
            // double-byte branch. Do not try to repair it by guessing.
            while (i<LEN len="s1.length();" i="0;" 中文标点等非汉字字符="" if="" else="" yhf="" s1="s1.substr(i);" by="" added="" ch="=13)" s2="" cr=""></LEN>=161)
              && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=162 && (unsigned char)s1[i+1]<=168)))
              && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=171 && (unsigned char)s1[i+1]<=191)))
              && (!((unsigned char)s1[i]==163 && ((unsigned char)s1[i+1]==172 || (unsigned char)s1[i+1]==161)
              || (unsigned char)s1[i+1]==168 || (unsigned char)s1[i+1]==169 || (unsigned char)s1[i+1]==186
              || (unsigned char)s1[i+1]==187 || (unsigned char)s1[i+1]==191)))
                {
                    // NOTE(review): "ii" is not declared in this function; presumably
                    // a duplicated-token artifact of "i=i+2" - confirm.
                    ii=i+2; // assume there is never half of a double-byte character
                }

                // NOTE(review): same undeclared "ii" as above - confirm.
                if (i==0) ii=i+2;

                // do not output the Chinese (full-width) space, byte pair 161,161
                if (!(ch==161 && (unsigned char)s1[1]==161))
                {
                    if (i <= s1.size()) // yhf
                        // other non-Hanzi double-byte characters may be output as one run
                        s2 += s1.substr(0, i) + SEPARATOR;
                    else break; // yhf
                }

                if (i <= s1.size()) // yhf
                    // NOTE(review): "s1s1" is not declared; presumably "s1=s1.substr(i)" - confirm.
                    s1s1=s1.substr(i);
                else break;     //yhf

                continue;
            }
        }


    // handle a run of Chinese characters below

        i = 2;
        len = s1.length();

        // NOTE(review): the loop condition below is truncated the same way; only
        // its tail ">=176" survived.
        while(i<LEN></LEN>=176)
//    while(i<LEN></LEN>=128 && (unsigned char)s1[i]!=161)
            i+=2;

        // segment the first i bytes of s1 and append the result
        s2+=SegmentHzStrMM(dict, s1.substr(0,i));

        if (i <= len)    // yhf
            // NOTE(review): undeclared "s1s1" again; presumably "s1=s1.substr(i)" - confirm.
            s1s1=s1.substr(i);
        else break; // yhf
    }

    return s2;
}

/**
 * Pre-process a sentence before segmentation.
 *
 * Consumes s1 from the front, one run at a time, and appends the processed
 * runs to the result. The first byte of what is left selects the branch:
 * values below 128 enter the ASCII branch; a run of Chinese characters is
 * handed to SegmentHzStrMM(); the remaining visible code copies a run to the
 * output followed by SEPARATOR unless it starts with the byte pair 161,161.
 *
 * NOTE(review): this listing is incomplete. The loop conditions marked below
 * were truncated when the page was extracted, so the function does not behave
 * as intended as shown; restore it from the upstream ChSeg/HzSeg.cpp.
 *
 * @param dict dictionary, forwarded to SegmentHzStrMM()
 * @param s1   sentence to process (passed by value and consumed)
 * @return the accumulated output string s2
 */
string CHzSeg::SegmentSentenceMM (CDict &dict, string s1) const
{
string s2="";
unsigned int i,len;

while (!s1.empty())
{
  unsigned char ch=(unsigned char) s1[0];
  if(ch<128)
  { // deal with ASCII
   i=1;
   len = s1.size();
   // NOTE(review): the next line is extraction damage. The text between "i" and
   // "=161)" is missing, which appears to have removed the rest of the ASCII
   // branch and the start of the double-byte branch. Do not repair by guessing.
   while (i=161)
              && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=162 && (unsigned char)s1[i+1]<=168)))
              && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=171 && (unsigned char)s1[i+1]<=191)))
              && (!((unsigned char)s1[i]==163 && ((unsigned char)s1[i+1]==172 || (unsigned char)s1[i+1]==161)
              || (unsigned char)s1[i+1]==168 || (unsigned char)s1[i+1]==169 || (unsigned char)s1[i+1]==186
              || (unsigned char)s1[i+1]==187 || (unsigned char)s1[i+1]==191)))
    {
     i=i+2; // assume there is never half of a double-byte character
    }

if (i==0) i=i+2;

    // do not output the Chinese (full-width) space, byte pair 161,161
    if (!(ch==161 && (unsigned char)s1[1]==161))
    {
     if (i <= s1.size()) // yhf
      // other non-Hanzi double-byte characters may be output as one run
      s2 += s1.substr(0, i) + SEPARATOR;
     else break; // yhf
    }

    if (i <= s1.size()) // yhf
     s1=s1.substr(i);
    else break;  //yhf

    continue;
   }
  }

// handle a run of Chinese characters below

i = 2;
len = s1.length();

  // NOTE(review): the loop condition below is truncated the same way; only its
  // tail "=176" survived.
  while(i=176)
//    while(i=128 && (unsigned char)s1[i]!=161)
   i+=2;

// segment the first i bytes of s1 and append the result
s2+=SegmentHzStrMM(dict, s1.substr(0,i));

  if (i <= len) // yhf
   s1=s1.substr(i);
  else break; // yhf
}

return s2;
}view plaincopy to clipboardprint?

view plaincopy to clipboardprint?
//Query.cpp

//Query.cppview plaincopy to clipboardprint?
<PRE class=csharp name="code">/**
* 程序翻译说明
* 将以"/"划分开的关键字一一顺序放入一个向量容器中
*
* @access public
* @param   vector<STRING></STRING> 参数的汉字说明：向量容器
* @return void
*/
void CQuery::ParseQuery(vector<STRING></STRING> &vecTerm)
{
    string::size_type idx;
    while ( (idx = m_sSegQuery.find("/ ")) != string::npos ) {
        vecTerm.push_back(m_sSegQuery.substr(0,idx));
        m_sSegQuerym_sSegQuery = m_sSegQuery.substr(idx+3);
    }
}
</PRE>
<PRE class=csharp name="code"> </PRE>
<PRE class=csharp name="code"><PRE class=csharp name="code">/**
* 程序翻译说明
* 相关性分析查询，构造结果集合setRelevantRst //瓶颈所在
*
* @access public
* @param   vector<STRING></STRING> map set<STRING></STRING> 参数的汉字说明：用户提交关键字的分词组，倒排索引映射，相关性结果集合
* @return string 0
*/
bool CQuery::GetRelevantRst
(
    vector<STRING></STRING> &vecTerm,
    map &mapBuckets,
    set<STRING></STRING> &setRelevantRst
) const
{
    set<STRING></STRING> setSRst;

    bool bFirst=true;
    vector<STRING></STRING>::iterator itTerm = vecTerm.begin();

    for ( ; itTerm != vecTerm.end(); ++itTerm )
    {

        setSRst.clear();
        copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));

        map mapRstDoc;
        string docid;
        int doccnt;

        map::iterator itBuckets = mapBuckets.find(*itTerm);
        if (itBuckets != mapBuckets.end())
        {
            string strBucket = (*itBuckets).second;
            string::size_type idx;
            idx = strBucket.find_first_not_of(" ");
            strBucketstrBucket = strBucket.substr(idx);

            while ( (idx = strBucket.find(" ")) != string::npos )
            {
                docid = strBucket.substr(0,idx);
                doccnt = 0;

                if (docid.empty()) continue;

                map::iterator it = mapRstDoc.find(docid);
                if ( it != mapRstDoc.end() )
                {
                    doccnt = (*it).second + 1;
                    mapRstDoc.erase(it);
                }
                mapRstDoc.insert( pair(docid,doccnt) );

                strBucketstrBucket = strBucket.substr(idx+1);
            }

            // remember the last one
            docid = strBucket;
            doccnt = 0;
            map::iterator it = mapRstDoc.find(docid);
            if ( it != mapRstDoc.end() )
            {
                doccnt = (*it).second + 1;
                mapRstDoc.erase(it);
            }
            mapRstDoc.insert( pair(docid,doccnt) );
        }

        // sort by term frequencty
        multimap > newRstDoc;
        map::iterator it0 = mapRstDoc.begin();
        for ( ; it0 != mapRstDoc.end(); ++it0 ){
            newRstDoc.insert( pair((*it0).second,(*it0).first) );
        }

        multimap::iterator itNewRstDoc = newRstDoc.begin();
        setRelevantRst.clear();
        for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){
            string docid = (*itNewRstDoc).second;

            if (bFirst==true) {
                setRelevantRst.insert(docid);
                continue;
            }

            if ( setSRst.find(docid) != setSRst.end() ){
                setRelevantRst.insert(docid);
            }
        }

        //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "<BR>";
        bFirst = false;
    }
    return true;
}</PRE>
</PRE>
接下来的就是显示了，前面都只是处理数据得到 setRelevantRst 这个查询结果集合，这里就不多说了，下面就和php之类的脚本语言差不多，格式化结果集合并显示出来。

view plaincopy to clipboardprint?/**   * 程序翻译说明   * 将以"/"划分开的关键字一一顺序放入一个向量容器中   *   * @access public   * @param   vector<STRING></STRING> 参数的汉字说明：向量容器   * @return void   */ void CQuery::ParseQuery(vector<STRING></STRING> &vecTerm)   {       string::size_type idx;        while ( (idx = m_sSegQuery.find("/ ")) != string::npos ) {            vecTerm.push_back(m_sSegQuery.substr(0,idx));            m_sSegQuery = m_sSegQuery.substr(idx+3);        }   } /**
* 程序翻译说明
* 将以"/"划分开的关键字一一顺序放入一个向量容器中
*
* @access public
* @param   vector 参数的汉字说明：向量容器
* @return void
*/
void CQuery::ParseQuery(vector &vecTerm)
{
string::size_type idx;
while ( (idx = m_sSegQuery.find("/ ")) != string::npos ) {
  vecTerm.push_back(m_sSegQuery.substr(0,idx));
  m_sSegQuery = m_sSegQuery.substr(idx+3);
}
}

view plaincopy to clipboardprint?
view plaincopy to clipboardprint?<PRE class=csharp name="code">/**   * 程序翻译说明   * 相关性分析查询，构造结果集合setRelevantRst //瓶颈所在   *   * @access public   * @param   vector<STRING></STRING> map set<STRING></STRING> 参数的汉字说明：用户提交关键字的分词组，倒排索引映射，相关性结果集合   * @return string 0   */ bool CQuery::GetRelevantRst   (       vector<STRING></STRING> &vecTerm,        map &mapBuckets,        set<STRING></STRING> &setRelevantRst   ) const {       set<STRING></STRING> setSRst;         bool bFirst=true;       vector<STRING></STRING>::iterator itTerm = vecTerm.begin();         for ( ; itTerm != vecTerm.end(); ++itTerm )       {             setSRst.clear();           copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));             map mapRstDoc;           string docid;           int doccnt;             map::iterator itBuckets = mapBuckets.find(*itTerm);           if (itBuckets != mapBuckets.end())           {               string strBucket = (*itBuckets).second;               string::size_type idx;               idx = strBucket.find_first_not_of(" ");               strBucket = strBucket.substr(idx);                 while ( (idx = strBucket.find(" ")) != string::npos )                {                   docid = strBucket.substr(0,idx);                   doccnt = 0;                     if (docid.empty()) continue;                     map::iterator it = mapRstDoc.find(docid);                   if ( it != mapRstDoc.end() )                   {                       doccnt = (*it).second + 1;                       mapRstDoc.erase(it);                   }                   mapRstDoc.insert( pair(docid,doccnt) );                     strBucket = strBucket.substr(idx+1);               }                 // remember the last one               docid = strBucket;               doccnt = 0;               map::iterator it = mapRstDoc.find(docid);               if ( it != mapRstDoc.end() )               {                   doccnt = (*it).second + 1;                   
mapRstDoc.erase(it);               }               mapRstDoc.insert( pair(docid,doccnt) );           }             // sort by term frequencty           multimap > newRstDoc;           map::iterator it0 = mapRstDoc.begin();           for ( ; it0 != mapRstDoc.end(); ++it0 ){               newRstDoc.insert( pair((*it0).second,(*it0).first) );           }             multimap::iterator itNewRstDoc = newRstDoc.begin();           setRelevantRst.clear();           for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){               string docid = (*itNewRstDoc).second;                 if (bFirst==true) {                   setRelevantRst.insert(docid);                   continue;               }                 if ( setSRst.find(docid) != setSRst.end() ){                       setRelevantRst.insert(docid);               }           }             //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "<BR>";           bFirst = false;       }       return true;   }</PRE> view plaincopy to clipboardprint?/**   * 程序翻译说明   * 相关性分析查询，构造结果集合setRelevantRst //瓶颈所在   *   * @access public   * @param   vector<STRING></STRING> map set<STRING></STRING> 参数的汉字说明：用户提交关键字的分词组，倒排索引映射，相关性结果集合   * @return string 0   */ bool CQuery::GetRelevantRst   (       vector<STRING></STRING> &vecTerm,        map &mapBuckets,        set<STRING></STRING> &setRelevantRst   ) const {       set<STRING></STRING> setSRst;         bool bFirst=true;       vector<STRING></STRING>::iterator itTerm = vecTerm.begin();         for ( ; itTerm != vecTerm.end(); ++itTerm )       {             setSRst.clear();           copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));             map mapRstDoc;           string docid;           int doccnt;             map::iterator itBuckets = mapBuckets.find(*itTerm);           if (itBuckets != mapBuckets.end())           {               string strBucket = (*itBuckets).second;               string::size_type idx;               idx = 
strBucket.find_first_not_of(" ");               strBucket = strBucket.substr(idx);                 while ( (idx = strBucket.find(" ")) != string::npos )                {                   docid = strBucket.substr(0,idx);                   doccnt = 0;                     if (docid.empty()) continue;                     map::iterator it = mapRstDoc.find(docid);                   if ( it != mapRstDoc.end() )                   {                       doccnt = (*it).second + 1;                       mapRstDoc.erase(it);                   }                   mapRstDoc.insert( pair(docid,doccnt) );                     strBucket = strBucket.substr(idx+1);               }                 // remember the last one               docid = strBucket;               doccnt = 0;               map::iterator it = mapRstDoc.find(docid);               if ( it != mapRstDoc.end() )               {                   doccnt = (*it).second + 1;                   mapRstDoc.erase(it);               }               mapRstDoc.insert( pair(docid,doccnt) );           }             // sort by term frequencty           multimap > newRstDoc;           map::iterator it0 = mapRstDoc.begin();           for ( ; it0 != mapRstDoc.end(); ++it0 ){               newRstDoc.insert( pair((*it0).second,(*it0).first) );           }             multimap::iterator itNewRstDoc = newRstDoc.begin();           setRelevantRst.clear();           for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){               string docid = (*itNewRstDoc).second;                 if (bFirst==true) {                   setRelevantRst.insert(docid);                   continue;               }                 if ( setSRst.find(docid) != setSRst.end() ){                       setRelevantRst.insert(docid);               }           }             //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "<BR>";           bFirst = false;       }       return true;   } /**
* 程序翻译说明
* 相关性分析查询，构造结果集合setRelevantRst //瓶颈所在
*
* @access public
* @param   vector map set 参数的汉字说明：用户提交关键字的分词组，倒排索引映射，相关性结果集合
* @return string 0
*/
bool CQuery::GetRelevantRst
(
vector &vecTerm,
map &mapBuckets,
set &setRelevantRst
) const
{
set setSRst;

bool bFirst=true;
vector::iterator itTerm = vecTerm.begin();

for ( ; itTerm != vecTerm.end(); ++itTerm )
{

setSRst.clear();
copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));

  map mapRstDoc;
  string docid;
  int doccnt;

  map::iterator itBuckets = mapBuckets.find(*itTerm);
  if (itBuckets != mapBuckets.end())
  {
   string strBucket = (*itBuckets).second;
   string::size_type idx;
   idx = strBucket.find_first_not_of(" ");
   strBucket = strBucket.substr(idx);

   while ( (idx = strBucket.find(" ")) != string::npos )
   {
    docid = strBucket.substr(0,idx);
    doccnt = 0;

if (docid.empty()) continue;

    map::iterator it = mapRstDoc.find(docid);
    if ( it != mapRstDoc.end() )
    {
     doccnt = (*it).second + 1;
     mapRstDoc.erase(it);
    }
    mapRstDoc.insert( pair(docid,doccnt) );

strBucket = strBucket.substr(idx+1);
}

   // remember the last one
   docid = strBucket;
   doccnt = 0;
   map::iterator it = mapRstDoc.find(docid);
   if ( it != mapRstDoc.end() )
   {
    doccnt = (*it).second + 1;
    mapRstDoc.erase(it);
   }
   mapRstDoc.insert( pair(docid,doccnt) );
  }

  // sort by term frequencty
  multimap > newRstDoc;
  map::iterator it0 = mapRstDoc.begin();
  for ( ; it0 != mapRstDoc.end(); ++it0 ){
   newRstDoc.insert( pair((*it0).second,(*it0).first) );
  }

  multimap::iterator itNewRstDoc = newRstDoc.begin();
  setRelevantRst.clear();
  for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){
   string docid = (*itNewRstDoc).second;

   if (bFirst==true) {
    setRelevantRst.insert(docid);
    continue;
   }

   if ( setSRst.find(docid) != setSRst.end() ){
    setRelevantRst.insert(docid);
   }
  }

//cout << "setRelevantRst.size(): " << setRelevantRst.size() << "";
bFirst = false;
}
return true;
}

接下来的就是显示了，前面都只是处理数据得到 setRelevantRst 这个查询结果集合，这里就不多说了，下面就和php之类的脚本语言差不多，格式化结果集合并显示出来。
//TSESearch.cpp

view plaincopy to clipboardprint?
// Display starts here: the three Show* calls below emit the result page.
    CDisplayRst iDisplayRst;
    iDisplayRst.ShowTop();

    // Elapsed time in milliseconds between begin_tv and end_tv: the whole
    // seconds scaled by 1000 plus the microsecond difference divided by 1000.
    float used_msec = (end_tv.tv_sec-begin_tv.tv_sec)*1000
        +((float)(end_tv.tv_usec-begin_tv.tv_usec))/(float)1000;

    // Receives the raw query string, the elapsed time, the number of matching
    // docids and iQuery.m_iStart.
    iDisplayRst.ShowMiddle(iQuery.m_sQuery,used_msec,
            setRelevantRst.size(), iQuery.m_iStart);

    // Receives the query terms, the matching docids, vecDocIdx and iQuery.m_iStart.
    iDisplayRst.ShowBelow(vecTerm,setRelevantRst,vecDocIdx,iQuery.m_iStart);

posted @ 2009-12-10 22:53 学者站在巨人的肩膀上阅读(1036) | 评论 (0) | 编辑收藏

学着站在巨人的肩膀上

公告

常用链接

留言簿(1)

随笔分类

随笔档案

搜索

最新评论

阅读排行榜

评论排行榜