C＋＋/D/Python性能比较续

周末抽空做了点小测试，根据http://blog.vckbase.com/jzhang/archive/2006/03/28/18807.html中m网友修改的算法，python版本中读取所有行以后就做一个排序，再去除重复项。这个算法在我的机器上执行时间是1735ms左右，属于python版本中最快的一个。

D版本暂还没想到有更优化的做法，D在处理以char[]作key的关联数组时，判断方法是先判断hash，如果hash相等，则继续做字符串判断。它执行时间是1120ms左右。

以D版本为基础，自己写了一个C＋＋的Email类：

// An e-mail address kept together with its precomputed hash.
class Email
{
public:
        // Copies the C string and hashes it once with my_hash().
        // Not marked explicit, so a const char* converts implicitly to Email.
        Email (const char* mail_)
                : mail(mail_)
                , hash(my_hash(mail_))
        {}

private:
        // The ordering operator reads the private fields directly.
        friend bool operator < (const Email& lhs, const Email& rhs);

        std::string mail;  // owned copy of the address text
        size_t hash;       // my_hash() of the address text
};

// Orders Email objects by hash first; the address text breaks ties between
// objects whose hashes are equal.
bool operator < (const Email& lhs, const Email& rhs)
{
        if (lhs.hash != rhs.hash)
                return lhs.hash < rhs.hash;
        return lhs.mail < rhs.mail;
}

把它插入set并判断是否有重复。

这个程序由于string的大量拷贝，以及大量内存分配，执行时间相当长，在我的机器上是5s左右。D和python版本由于对象拷贝成本较低，加上都有内存分配策略，自然有一些优势。

退而求其次，既然hash冲突的几率较低，试试只保存hash：

// Hash-only variant: stores just my_hash() of the address and drops the text,
// so two different addresses with the same hash cannot be told apart.
class Email
{
public:
        // Not marked explicit, so a const char* converts implicitly to Email.
        Email (const char* mail_)
                : hash(my_hash(mail_))
        {}

private:
        // The ordering operator reads the private field directly.
        friend bool operator < (const Email& lhs, const Email& rhs);

        size_t hash;  // my_hash() of the address text
};

// Orders Email objects by their hash value alone.
bool operator < (const Email& lhs, const Email& rhs)
{
        return rhs.hash > lhs.hash;
}

这次测试就比较快了，耗时仅1020ms左右，比D版本还要快，当然它不是完善的版本。

考虑到构造成本，于是改为只用一个set<int>来保存hash值再测试，这次耗时是930ms。

实际上可以做一个改进的C＋＋版本，一次性读入文件的全部内容到一个大缓冲区，把所有的\n字符修改为\0，用一个动态数组保存缓冲区的所有字符串指针，hash值也须计算并保存到数组。再用D的索引方式，先hash比较，再字符串比较，效率应该也不低。

实现了一个：

#include <iostream>
#include <string>
#include <set>
#include <fstream>
#include <iterator>
#include <sys/time.h>
using namespace std;

// Multiplicative string hash over a NUL-terminated string: every character is
// folded in as ret = 11 * ret + ch. The empty string hashes to 0.
size_t my_hash (const char* str)
{
        size_t ret = 0;
        for (const char* p = str; *p != '\0'; ++p)
                ret = ret * 11 + *p;
        return ret;
}

// An address represented by a pointer to its NUL-terminated text plus the
// hash of that text. The text itself is not copied.
class Email
{
public:
        // Not marked explicit, so a const char* converts implicitly to Email.
        // Only the pointer is stored; the string must stay valid while the
        // object is in use.
        Email (const char* mail_)
                : hash(my_hash(mail_))
                , mail(mail_)
        {}

private:
        // The ordering operator reads the private fields directly.
        friend bool operator < (const Email& lhs, const Email& rhs);

        size_t hash;       // my_hash() of the address text
        const char* mail;  // borrowed pointer to the address text
};

// Orders Email objects by hash; strcmp() on the address text decides between
// objects whose hashes are equal.
bool operator < (const Email& lhs, const Email& rhs)
{
        if (lhs.hash != rhs.hash)
                return lhs.hash < rhs.hash;
        return strcmp(lhs.mail, rhs.mail) < 0;
}

// Copies the lines of argv[1] to argv[2], writing each distinct line once (in
// order of first appearance), then prints the elapsed time in milliseconds.
// Returns 0 on success, 1 on bad arguments, 2/3 if the input/output file
// cannot be opened.
int main(int argc, char** argv)
{
        if (argc < 3)
        {
                cout << "Wrong arguments" << endl;
                return 1;
        }

        FILE* fin = fopen(argv[1], "r");
        if (!fin)
        {
                cout << "Invalid input file" << endl;
                return 2;
        }
        FILE* fout = fopen(argv[2], "w");
        if (!fout)
        {
                fclose(fin);
                cout << "Invalid output file" << endl;
                return 3;
        }

        timeval start, end;

        // One byte is reserved for the terminating NUL, so at most
        // BUF_SIZE - 1 bytes of input are processed.
        const size_t BUF_SIZE = 20 * 1024 * 1024;
        char* buffer = new char[BUF_SIZE];

        gettimeofday(&start, 0);
        set<Email> emails;

        // Element size 1 makes fread() return the number of bytes read, so the
        // data can be terminated explicitly instead of pre-zeroing the buffer.
        const size_t length = fread (buffer, 1, BUF_SIZE - 1, fin);
        buffer[length] = '\0';
        // If the buffer is full, probe for leftover input so truncation is reported.
        const bool truncated = (length == BUF_SIZE - 1) && (fgetc(fin) != EOF);

        char* const stop = buffer + length;
        char* begin = buffer;
        for (char* current = buffer; current != stop; ++current)
        {
                if (*current != '\n')
                        continue;

                // Turn the line into a C string in place.
                *current = '\0';
                if (emails.insert(begin).second)
                {
                        fputs(begin, fout);
                        fputc('\n', fout);
                }
                begin = current + 1;
        }

        // A final line that is not terminated by '\n' still counts.
        if (begin != stop && emails.insert(begin).second)
        {
                fputs(begin, fout);
                fputc('\n', fout);
        }

        fclose(fout);
        fclose(fin);

        gettimeofday(&end, 0);

        // The timeval fields are long-sized on common platforms; cast so the
        // argument matches the %ld conversion.
        const long elapsed_ms = static_cast<long>(
                (end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) / 1000);
        printf("Time used: %ld ms\n", elapsed_ms);

        if (truncated)
                cout << "Warning: input is larger than the buffer; output is incomplete" << endl;

        delete[] buffer;
        return 0;
}

memset不是必须的，不过我当时不知道如何获取读入的大小：fread(buffer, BUF_SIZE, 1, fin)返回的是读到的完整元素个数，文件比BUF_SIZE小时读不满一个元素，返回值就是0（改成fread(buffer, 1, BUF_SIZE, fin)才会返回实际读入的字节数）。所以我这里用memset先初始化内存，但不把它计入耗时。new操作也没计入，因为其它语言比如python、D在启动时都由运行时做了类似工作。

这个程序在我的机器上耗时为1350ms左右。我想可能慢在set上，是对象拷贝？还是内存分配？

做了几个优化版本，没多大提高。

重新测试了下：
A、python（m网友版本）：
lijie t # python test.py
1689.0411377
lijie t # python test.py
1711.40599251
lijie t # python test.py
1699.63312149
lijie t # python test.py
1712.00013161
lijie t # python test.py
1713.8838768

B、D版本：
lijie t # ./testd email.txt email-new.txt
1091
lijie t # ./testd email.txt email-new.txt
1070
lijie t # ./testd email.txt email-new.txt
1062
lijie t # ./testd email.txt email-new.txt
1062
lijie t # ./testd email.txt email-new.txt
1096

C、C＋＋只比较hash，set<Email>版本：
lijie t # ./test3 email.txt email-new.txt
Time used: 981 ms
lijie t # ./test3 email.txt email-new.txt
Time used: 1000 ms
lijie t # ./test3 email.txt email-new.txt
Time used: 980 ms
lijie t # ./test3 email.txt email-new.txt
Time used: 986 ms
lijie t # ./test3 email.txt email-new.txt
Time used: 987 ms

D、C＋＋只比较hash，set<int>版本：
lijie t # ./test4 email.txt email-new.txt
Time used: 951 ms
lijie t # ./test4 email.txt email-new.txt
Time used: 953 ms
lijie t # ./test4 email.txt email-new.txt
Time used: 947 ms
lijie t # ./test4 email.txt email-new.txt
Time used: 950 ms
lijie t # ./test4 email.txt email-new.txt
Time used: 962 ms

E、C＋＋大缓冲区，比较hash和字符串，set<Email>版本：
lijie t # ./test5 email.txt email-new.txt
Time used: 1375 ms
lijie t # ./test5 email.txt email-new.txt
Time used: 1359 ms
lijie t # ./test5 email.txt email-new.txt
Time used: 1369 ms
lijie t # ./test5 email.txt email-new.txt
Time used: 1378 ms
lijie t # ./test5 email.txt email-new.txt
Time used: 1396 ms

F、C＋＋大缓冲区，比较字符串版本：
lijie t # ./test6 email.txt email-new.txt
Time used: 1168 ms
lijie t # ./test6 email.txt email-new.txt
Time used: 1169 ms
lijie t # ./test6 email.txt email-new.txt
Time used: 1171 ms
lijie t # ./test6 email.txt email-new.txt
Time used: 1179 ms
lijie t # ./test6 email.txt email-new.txt
Time used: 1169 ms

从C、E和F来看，对象拷贝成本是比较高的，E版本仅仅比C版本多了个const char*成员变量，hash值比较散，很少会真的执行到strcmp。保持E版本对象结构不变，把operator <里面的实现改为C版本，测试结果如下：
lijie t # ./test5 email.txt email-new.txt
Time used: 1355 ms
lijie t # ./test5 email.txt email-new.txt
Time used: 1360 ms
lijie t # ./test5 email.txt email-new.txt
Time used: 1348 ms
lijie t # ./test5 email.txt email-new.txt
Time used: 1353 ms
lijie t # ./test5 email.txt email-new.txt
Time used: 1379 ms

效率只提高了一点点，这个版本仅仅比C版本多了个成员变量拷贝，竟然慢了这么多。说明Email对象的2个成员变量拷贝成本的确很高。

F版本相比之下反而效率很不错，主要原因是email数据不够复杂，仅通过前几位就可以比较出结果。如果每行数据比较长，而且很多行要到后几个字符才能比较出来，肯定就不那么快了。

hash值的计算虽然执行了一系列乘法，不过还是相当迅速。

D语言版本执行了hash值和字符串比较，是比较完善的，效率很不错。C＋＋相应版本看来要提高set的效率才能达到。

jzhang的第一个python版本在我的机器上执行如下：
lijie t # python test2.py
3122.9569912 ms
lijie t # python test2.py
3209.42997932 ms
lijie t # python test2.py
3141.47305489 ms
lijie t # python test2.py
3129.57286835 ms
lijie t # python test2.py
3196.03514671 ms

我做了点修改，执行速度提高了一些：

#remove duplicated email address from file
import datetime
from time import time

if __name__ == "__main__":
    start = time()
    # Lines already written; only the first occurrence of a line is copied.
    seen = set()
    f = open("email.txt", "r")
    f2 = open("email_new.txt", "w")
    try:
        # Iterating the file object yields one line at a time, newline included.
        for line in f:
            if line not in seen:
                seen.add(line)
                f2.write(line)
    finally:
        f.close()
        f2.close()
    # Single-argument form prints the same text under Python 2 and Python 3.
    print("%s ms" % ((time() - start) * 1000))

在我的机器上执行结果如下：
lijie t # python test1.py
2239.22801018 ms
lijie t # python test1.py
2301.00703239 ms
lijie t # python test1.py
2282.06086159 ms
lijie t # python test1.py
2296.57006264 ms
lijie t # python test1.py
2281.25810623 ms

不过还是没有m网友的效率高。

在F版本的基础上，借鉴m网友的做法，实现一个G版本：

G、排序并去除重复元素，比较hash和字符串版本：

#include <iostream>
#include <string>
#include <fstream>
#include <iterator>
#include <sys/time.h>
#include <vector>
using namespace std;

// Hashes a NUL-terminated string by folding each character into the running
// value as ret = 11 * ret + ch. The empty string hashes to 0.
size_t my_hash (const char* str)
{
        size_t ret = 0;
        for (size_t i = 0; str[i] != '\0'; ++i)
                ret = ret * 11 + str[i];
        return ret;
}

// An address represented by a borrowed pointer to its NUL-terminated text
// plus the hash of that text, so most comparisons only touch the hash.
class Email
{
private:
        size_t hash;       // my_hash() of the address text
        const char* mail;  // not owned; must outlive this object
        friend bool operator < (const Email& lhs, const Email& rhs);
public:
        // Not marked explicit, so a const char* converts implicitly to Email.
        // Only the pointer is stored; the text is not copied.
        Email (const char* mail_)
                : hash(my_hash(mail_)), mail(mail_)
        {
        }

        // True when both hashes match and the texts compare equal.
        // Declared const so it can also be applied to const Email objects.
        bool operator == (const Email& rhs) const
        {
                return hash == rhs.hash && strcmp(mail, rhs.mail) == 0;
        }

        // Returns the stored pointer to the address text.
        const char* getEmail() const
        {
                return mail;
        }
};

// Hash-major ordering for Email; equal hashes fall back to strcmp() on the
// address text.
bool operator < (const Email& lhs, const Email& rhs)
{
        return (lhs.hash == rhs.hash)
                ? strcmp(lhs.mail, rhs.mail) < 0
                : lhs.hash < rhs.hash;
}

// Reads argv[1], sorts its lines (by hash, then text), removes duplicates and
// writes the result to argv[2], then prints the elapsed time in milliseconds.
// Returns 0 on success, 1 on bad arguments, 2/3 if the input/output file
// cannot be opened.
int main(int argc, char** argv)
{
        if (argc < 3)
        {
                cout << "Wrong arguments" << endl;
                return 1;
        }

        FILE* fin = fopen(argv[1], "r");
        if (!fin)
        {
                cout << "Invalid input file" << endl;
                return 2;
        }
        FILE* fout = fopen(argv[2], "w");
        if (!fout)
        {
                fclose(fin);
                cout << "Invalid output file" << endl;
                return 3;
        }

        timeval start, end;

        // One byte is reserved for the terminating NUL, so at most
        // BUF_SIZE - 1 bytes of input are processed.
        const size_t BUF_SIZE = 20 * 1024 * 1024;
        char* buffer = new char[BUF_SIZE];

        gettimeofday(&start, 0);
        vector<Email> emails;

        // Element size 1 makes fread() return the number of bytes read, so the
        // data can be terminated explicitly instead of pre-zeroing the buffer.
        const size_t length = fread (buffer, 1, BUF_SIZE - 1, fin);
        buffer[length] = '\0';
        // If the buffer is full, probe for leftover input so truncation is reported.
        const bool truncated = (length == BUF_SIZE - 1) && (fgetc(fin) != EOF);
        fclose(fin);

        char* const stop = buffer + length;
        char* begin = buffer;
        for (char* current = buffer; current != stop; ++current)
        {
                if (*current != '\n')
                        continue;

                // Turn the line into a C string in place and record it.
                *current = '\0';
                emails.push_back(begin);
                begin = current + 1;
        }

        // A final line that is not terminated by '\n' still counts.
        if (begin != stop)
                emails.push_back(begin);

        // Sorting brings equal addresses together so unique() can drop them.
        sort(emails.begin(), emails.end());
        emails.erase (unique( emails.begin(), emails.end() ), emails.end());

        for (vector<Email>::const_iterator iter = emails.begin();
             iter != emails.end();
             ++iter)
        {
                fputs(iter->getEmail(), fout);
                fputc('\n', fout);
        }

        fclose(fout);

        gettimeofday(&end, 0);

        // The timeval fields are long-sized on common platforms; cast so the
        // argument matches the %ld conversion.
        const long elapsed_ms = static_cast<long>(
                (end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) / 1000);
        printf("Time used: %ld ms\n", elapsed_ms);

        if (truncated)
                cout << "Warning: input is larger than the buffer; output is incomplete" << endl;

        delete[] buffer;

        return 0;
}

在我的机器上执行如下：
lijie t # ./test7 email.txt email-new.txt
Time used: 676 ms
lijie t # ./test7 email.txt email-new.txt
Time used: 675 ms
lijie t # ./test7 email.txt email-new.txt
Time used: 671 ms
lijie t # ./test7 email.txt email-new.txt
Time used: 669 ms
lijie t # ./test7 email.txt email-new.txt
Time used: 673 ms

比F版本耗时减少了四成多（速度约为F版本的1.7倍），也快过了其它所有版本。不过由于数据是vector保存的，在数据大量重复的情况下，性能可能会有较大的降低。

把operator<和operator==的实现改为strcmp比较，执行结果如下：
lijie t # ./test8 email.txt email-new.txt
Time used: 1275 ms
lijie t # ./test8 email.txt email-new.txt
Time used: 1267 ms
lijie t # ./test8 email.txt email-new.txt
Time used: 1297 ms
lijie t # ./test8 email.txt email-new.txt
Time used: 1296 ms
lijie t # ./test8 email.txt email-new.txt
Time used: 1271 ms

修改了下，增加了计时，修正了fread使用错误。

#include <iostream>
#include <string>
#include <fstream>
#include <iterator>
#include <vector>
using namespace std;

#ifdef _WIN32
# include <windows.h>
#else // _WIN32
# include <sys/time.h>
#endif // _WIN32

// Computes a multiplicative hash of a NUL-terminated string: for every
// character, the running value is multiplied by 11 and the character added.
// The empty string hashes to 0.
size_t my_hash (const char* str)
{
        size_t ret = 0;
        for (char ch = *str; ch != '\0'; ch = *++str)
        {
                ret *= 11;
                ret += ch;
        }
        return ret;
}

// An address represented by a borrowed pointer to its NUL-terminated text
// plus the hash of that text, so most comparisons only touch the hash.
class Email
{
private:
        size_t hash;       // my_hash() of the address text
        const char* mail;  // not owned; must outlive this object
        friend bool operator < (const Email& lhs, const Email& rhs);
public:
        // Not marked explicit, so a const char* converts implicitly to Email.
        // Only the pointer is stored; the text is not copied.
        Email (const char* mail_)
                : hash(my_hash(mail_)), mail(mail_)
        {
        }

        // True when both hashes match and the texts compare equal.
        // Declared const so it can also be applied to const Email objects.
        bool operator == (const Email& rhs) const
        {
                return hash == rhs.hash && strcmp(mail, rhs.mail) == 0;
        }

        // Returns the stored pointer to the address text.
        const char* getEmail() const
        {
                return mail;
        }
};

// Orders Email objects by hash; when the hashes are equal the result is
// decided by strcmp() on the address text.
bool operator < (const Email& lhs, const Email& rhs)
{
        if (lhs.hash < rhs.hash)
                return true;
        if (rhs.hash < lhs.hash)
                return false;
        return strcmp(lhs.mail, rhs.mail) < 0;
}

#ifndef _WIN32
// Stopwatch based on gettimeofday().
class Timer
{
        timeval begin, end;
public:
        // Zero both timestamps so milliseconds() is defined (and 0) even
        // before start()/stop() have been called.
        Timer () : begin(), end() {}
        void start () {gettimeofday(&begin, 0);}
        void stop () {gettimeofday(&end, 0);}
        // Whole milliseconds between start() and stop(). Returns 0 if stop()
        // does not come after start().
        size_t milliseconds () const {
                long sec = static_cast<long>(end.tv_sec - begin.tv_sec);
                long usec = static_cast<long>(end.tv_usec - begin.tv_usec);
                // Borrow one second so the microsecond part is never negative;
                // dividing a negative value would round the wrong way.
                if (usec < 0) {
                        --sec;
                        usec += 1000000;
                }
                if (sec < 0)
                        return 0;
                return static_cast<size_t>(sec) * 1000 + static_cast<size_t>(usec / 1000);
        }
};
#else // _WIN32
// Stopwatch based on GetTickCount().
class Timer
{
        DWORD begin, end;
public:
        // Zero both tick counts so milliseconds() is defined (and 0) even
        // before start()/stop() have been called.
        Timer () : begin(0), end(0) {}
        void start () {begin = GetTickCount();}
        void stop () {end = GetTickCount();}
        // Milliseconds between start() and stop(); the unsigned subtraction
        // stays correct across a single wrap of the tick counter.
        size_t milliseconds () const {
                return end - begin;
        }
};
#endif // _WIN32

// Runs the sort-and-unique de-duplication of argv[1] into argv[2] ten times
// and prints the total time of every run.
// Returns 0 on success, 1 on bad arguments, 2/3 if the input/output file
// cannot be opened.
int main(int argc, char** argv)
{
        if (argc < 3)
        {
                cout << "Wrong arguments" << endl;
                return 1;
        }

        for (int i=0; i<10; ++i) {

                FILE* fin = fopen(argv[1], "r");
                if (!fin)
                {
                        cout << "Invalid input file" << endl;
                        return 2;
                }
                FILE* fout = fopen(argv[2], "w");
                if (!fout)
                {
                        fclose(fin);
                        cout << "Invalid output file" << endl;
                        return 3;
                }

                Timer total, part;
                total.start();
                part.start();

                // One byte is reserved for the terminating NUL, so at most
                // BUF_SIZE - 1 bytes of input are processed.
                const size_t BUF_SIZE = 20 * 1024 * 1024;

                // A failed new[] throws std::bad_alloc, so no null check is needed.
                char* buffer = new char[BUF_SIZE];

                part.stop();
                // cout << "Alloc buffer, " << part.milliseconds() << " ms used." << endl;
                part.start();

                // Reading BUF_SIZE - 1 bytes keeps buffer[length] inside the allocation.
                const size_t length = fread (buffer, 1, BUF_SIZE - 1, fin);
                // If the buffer is full, probe for leftover input so truncation is reported.
                const bool truncated = (length == BUF_SIZE - 1) && (fgetc(fin) != EOF);
                fclose(fin);
                buffer[length] = '\0';
                part.stop();
                // cout << "Read file, " << part.milliseconds() << " ms used." << endl;
                part.start();

                vector<Email> emails;

                char* const stop = buffer + length;
                char* begin = buffer;
                for (char* current = buffer; current != stop; ++current)
                {
                        if (*current != '\n')
                                continue;

                        // Turn the line into a C string in place and record it.
                        *current = '\0';
                        emails.push_back(begin);
                        begin = current + 1;
                }

                // A final line that is not terminated by '\n' still counts.
                if (begin != stop)
                        emails.push_back(begin);

                part.stop();
                // cout << "Put emails into vector, " << part.milliseconds() << " ms used." << endl;
                part.start();

                sort(emails.begin(), emails.end());
                part.stop();
                // cout << "Sort emails, " << part.milliseconds() << " ms used." << endl;
                part.start();

                emails.erase (unique( emails.begin(), emails.end() ), emails.end());
                part.stop();
                // cout << "Unique emails, " << part.milliseconds() << " ms used." << endl;
                part.start();

                for (vector<Email>::const_iterator iter = emails.begin();
                     iter != emails.end();
                     ++iter)
                {
                        fputs(iter->getEmail(), fout);
                        fputc('\n', fout);
                }

                fclose(fout);

                part.stop();
                // cout << "Write emails into new file, " << part.milliseconds() << " ms used." << endl;

                total.stop();

                cout << "Total used: " << total.milliseconds() << " ms." << endl;

                if (truncated)
                        cout << "Warning: input is larger than the buffer; output is incomplete" << endl;

                delete[] buffer;
        }

        return 0;
}

使用“-O3 -fomit-frame-pointer -funroll-loops -mtune=pentium4”选项编译，耗时从680ms减少到620ms

优化文件读写版：

#include <algorithm>
#include <cstring>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>
using namespace std;

// config
#define USE_CACHE
//#define PROFILE
// end config

#ifdef _WIN32
# include <windows.h>
#else // _WIN32
# include <sys/time.h>
#endif // _WIN32

#ifndef _WIN32
// Stopwatch based on gettimeofday().
class Timer
{
    timeval begin, end;
public:
    // Zero both timestamps so milliseconds() is defined (and 0) even before
    // start()/stop() have been called.
    Timer () : begin(), end() {}
    void start () {gettimeofday(&begin, 0);}
    void stop () {gettimeofday(&end, 0);}
    // Whole milliseconds between start() and stop(). Returns 0 if stop() does
    // not come after start().
    size_t milliseconds () const {
        long sec = static_cast<long>(end.tv_sec - begin.tv_sec);
        long usec = static_cast<long>(end.tv_usec - begin.tv_usec);
        // Borrow one second so the microsecond part is never negative;
        // dividing a negative value would round the wrong way.
        if (usec < 0) {
            --sec;
            usec += 1000000;
        }
        if (sec < 0)
            return 0;
        return static_cast<size_t>(sec) * 1000 + static_cast<size_t>(usec / 1000);
    }
};
#else // _WIN32
// Stopwatch based on GetTickCount().
class Timer
{
    DWORD begin, end;
public:
    // Zero both tick counts so milliseconds() is defined (and 0) even before
    // start()/stop() have been called.
    Timer () : begin(0), end(0) {}
    void start () {begin = GetTickCount();}
    void stop () {end = GetTickCount();}
    // Milliseconds between start() and stop(); the unsigned subtraction stays
    // correct across a single wrap of the tick counter.
    size_t milliseconds () const {
        return end - begin;
    }
};
#endif // _WIN32

// PROFILE_OUTPUT(timer, info): when PROFILE is defined, stops `timer`, prints
// "<info>: <elapsed> ms used." to cout and restarts the timer; otherwise it
// does nothing. Both forms expand to exactly one statement, so the macro is
// safe inside an unbraced if/else. The caller supplies the trailing semicolon.
#ifdef PROFILE
# define PROFILE_OUTPUT(timer,info) \
    do { \
        (timer).stop(); \
        cout << (info) << ": " << (timer).milliseconds() << " ms used." << endl; \
        (timer).start(); \
    } while (0)
#else // PROFILE
# define PROFILE_OUTPUT(timer,info) ((void)0)
#endif // PROFILE

// Multiplicative hash of a NUL-terminated string: each character updates the
// running value as ret = 11 * ret + ch. The empty string hashes to 0.
size_t my_hash (const char* str)
{
    size_t ret = 0;
    const char* cursor = str;
    while (*cursor != '\0')
    {
        ret = ret * 11 + *cursor;
        ++cursor;
    }
    return ret;
}

// An address represented by a borrowed pointer to its NUL-terminated text
// plus the hash of that text, so most comparisons only touch the hash.
class Email
{
private:
    size_t hash;       // my_hash() of the address text
    const char* mail;  // not owned; must outlive this object
    friend bool operator < (const Email& lhs, const Email& rhs);
public:
    // Not marked explicit, so a const char* converts implicitly to Email.
    // Only the pointer is stored; the text is not copied.
    Email (const char* mail_)
        : hash(my_hash(mail_)), mail(mail_)
    {
    }

    // True when both hashes match and the texts compare equal.
    // Declared const so it can also be applied to const Email objects.
    bool operator == (const Email& rhs) const
    {
        return hash == rhs.hash && strcmp(mail, rhs.mail) == 0;
    }

    // Returns the stored pointer to the address text.
    const char* getEmail() const
    {
        return mail;
    }

    // Length of the address text; computed with strlen() on every call.
    size_t getLength() const
    {
        return strlen(mail);
    }
};

// Orders Email objects by hash, using strcmp() on the address text only when
// the two hashes are equal.
bool operator < (const Email& lhs, const Email& rhs)
{
    const bool same_hash = (lhs.hash == rhs.hash);
    if (!same_hash)
        return lhs.hash < rhs.hash;
    return strcmp(lhs.mail, rhs.mail) < 0;
}

#ifdef USE_CACHE
// Write-combining layer in front of an ofstream: bytes are collected in a
// fixed-size buffer and passed to the stream when the buffer fills, on
// flush(), and on destruction. The stream is held by reference and must
// outlive this object.
class OfstreamBuffer
{
    std::ofstream& ofs;
    size_t buf_size;   // capacity of buffer, always at least 1
    size_t offset;     // number of pending bytes in buffer
    char* buffer;

    // The object owns a raw allocation, so copying is disabled
    // (declared private and never defined).
    OfstreamBuffer (const OfstreamBuffer&);
    OfstreamBuffer& operator = (const OfstreamBuffer&);

public:
    // A buf_size_ of 0 is treated as 1; with an empty buffer write() could
    // never make progress.
    OfstreamBuffer (std::ofstream& ofs_, size_t buf_size_)
        : ofs(ofs_), buf_size(buf_size_ > 0 ? buf_size_ : 1), offset(0),
          buffer(new char[buf_size])
    {
    }

    // Pushes any pending bytes to the stream before releasing the buffer.
    ~OfstreamBuffer ()
    {
        flush();
        delete[] buffer;
    }

    // Appends size bytes starting at ptr, flushing each time the buffer fills.
    void write (const char* ptr, size_t size)
    {
        while (size > 0)
        {
            size_t copy_size = buf_size - offset;
            if (copy_size > size)
                copy_size = size;
            memcpy (buffer + offset, ptr, copy_size);
            offset += copy_size;
            ptr += copy_size;
            size -= copy_size;

            if (offset == buf_size)
                flush ();
        }
    }

    // Hands pending bytes to the stream, then flushes the stream itself.
    void flush ()
    {
        if (offset > 0)
        {
            ofs.write(buffer, static_cast<std::streamsize>(offset));
            offset = 0;
        }
        ofs.flush();
    }
};
#else// USE_CACHE
// Pass-through variant with the same interface: every write goes straight to
// the stream and the buffer size argument is ignored.
class OfstreamBuffer
{
    std::ofstream& ofs;

public:
    OfstreamBuffer (std::ofstream& ofs_, size_t /* buf_size_ */)
        : ofs(ofs_)
    {
    }

    // Writes size bytes starting at ptr directly to the stream.
    void write (const char* ptr, size_t size)
    {
        ofs.write(ptr, static_cast<std::streamsize>(size));
    }

    // Flushes the underlying stream.
    void flush ()
    {
        ofs.flush();
    }
};
#endif // USE_CACHE

int main(int argc, char** argv)
{
    if (argc < 3)
    {
        cout << "Wrong arguments" << endl;
        return 1;
    }

    for (int i=0; i<1; ++i) {

        ifstream ifs(argv[1], ios::binary);
        if (!ifs)
        {
            cout << "Invalid input file" << endl;
            return 2;
        }

        ofstream ofs(argv[2], ios::binary);
        if (!ofs)
        {
            cout << "Invalid output file" << endl;
            return 3;
        }

        Timer total, part;
        total.start();
        part.start();

        ifs.seekg(0, ios_base::end);
        size_t file_size = (size_t)ifs.tellg() + 1;
        cout << "file size: " << file_size << endl;
        ifs.seekg(0, ios_base::beg);

        char* buffer = new char[file_size];
        if (!buffer){
            cout << "Alloc buffer failed" << endl;
            return 4;
        }

        PROFILE_OUTPUT(part, "Alloc buffer");

        ifs.read(buffer, file_size);
        buffer[file_size] = '\0';

        PROFILE_OUTPUT(part, "Read file");

        vector<Email> emails;
        emails.reserve(1000000);

        char* begin = buffer;
        char* current = buffer;

        while (*current != '\0')
        {
            if (*current == '\n')
            {
                *current = '\0';
                emails.push_back(begin);
                begin = current + 1;
            }
            ++ current;
        }

        PROFILE_OUTPUT(part, "Put emails into vector");

        sort(emails.begin(), emails.end());

        PROFILE_OUTPUT(part, "Sort emails");

        emails.erase (unique( emails.begin(), emails.end() ), emails.end());

        PROFILE_OUTPUT(part, "Unique emails");

        OfstreamBuffer ofsBuffer (ofs, 4 * 1024);

        for (vector<Email>::const_iterator iter = emails.begin();
             iter != emails.end();
             iter ++ )
        {
            ofsBuffer.write(iter->getEmail(), iter->getLength());
            ofsBuffer.write("\n", 1);
        }
        ofsBuffer.flush();

        PROFILE_OUTPUT(part, "Write emails into new file");

        total.stop();
        cout << "Total used: " << total.milliseconds() << " ms." << endl;

        delete[] buffer;
    }

    return 0;
}

这个版本在windows上用dev-cpp所带的gcc 3.4.2来编译，最好成绩是609ms。在cygwin里可以达到650ms。

posted on 2006-04-03 11:00 qiezi 阅读(2826) 评论(27) 编辑收藏引用所属分类: C＋＋、杂谈

qiezi的学习园地

常用链接

随笔分类(113)

随笔档案(85)

Java相关

Ruby相关

收藏文章

收藏站点

优秀blog集锦

中国名菜

最新评论