[算法]找出m个数中最小的n个数

这个问题属于常见问题了，我的办法是采用堆。

截取STL中的partial_sort算法的实现：

// Excerpt quoted from an STL implementation of partial_sort.
// Rearranges [first, last) so that [first, middle) ends up sorted according
// to comp. The unnamed T* parameter is only used to name the element type T.
template <class RandomAccessIterator, class T, class Compare>

void __partial_sort(RandomAccessIterator first, RandomAccessIterator middle,

RandomAccessIterator last, T*, Compare comp) {

// Turn the leading [first, middle) range into a heap ordered by comp.
make_heap(first, middle, comp);

// Scan the remaining elements in [middle, last).
for (RandomAccessIterator i = middle; i < last; ++i)

// Only an element that orders before the current heap top (*first) is of interest.
if (comp(*i, *first))

// NOTE(review): __pop_heap is not shown here; presumably it stores the old
// heap top at *i and re-inserts the copied value T(*i) into the heap -- confirm.
__pop_heap(first, middle, i, T(*i), comp, distance_type(first));

// Finally sort the heap range in place.
sort_heap(first, middle, comp);

}

上面这个函数是对一个序列进行部分排序，排序之后，[first, middle)区间内存放的是整个序列中最小的(middle - first)个元素，并且有序。
它的做法是首先将原始的[first, middle)中的数据生成一个大项堆，然后遍历[middle, last)中的数据，如果某个元素比堆顶（即堆中最大的元素）小，就把堆顶元素删除并插入这个新的元素，这个过程的复杂度是lg(middle - first)，最后进行堆排序。

不是每个元素都需要重新调整堆，所以最坏的情况时间复杂度是n * lg(middle - first)。

类似的，可以根据这个算法并且使用堆算法解决这个问题，如上所言，最坏的时间复杂度是n * lg(m)，其中n是元素总量，m是要找出的最小元素的个数。

根据我上面的这个想法，还有我原来写过的堆算法，给出解决这个问题的代码。由于我原来写的堆算法是大项堆(max-heap)，也就是堆中最大的元素在根部，所以这里求出来的是数组中最小的n个数（示例代码是从10个随机数中找出最小的2个）；如果要求最大的n个数，要将这个堆改成小项堆(min-heap)，其实也就是改变堆中父子之间的大小关系罢了，其他的地方不变。

/********************************************************************
    created:    2007/3/18
    filename:     main.cpp
    author:        Lichuang

    purpose:    测试模拟堆算法
*********************************************************************/

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <time.h>

using namespace std;

// push_heap inserts a new element into a max-heap. The element to insert is
// the one stored at Last - 1; the elements before it must already satisfy the
// heap property.
void    push_heap(int* pFirst, int* pLast);

// pop_heap removes one element from the heap; [First, Last) must satisfy the
// heap property. The removed element is moved to position Last - 1. Because
// this is a max-heap, the removed element is the largest one in the range.
void    pop_heap(int* pFirst, int* pLast);

// make_heap rearranges the elements of [First, Last) so that they satisfy the
// heap property.
void    make_heap(int* pFirst, int* pLast);

// Sorts a heap. The result is only correctly sorted if [pFirst, pLast)
// satisfies the heap property on entry.
void    sort_heap(int* pFirst, int* pLast);

// Checks whether [First, Last) satisfies the heap property: returns 1 if it
// does, 0 otherwise.
char    is_heap(int* pFirst, int* pLast);

// Re-establishes the heap property: places nValue starting from the hole at
// nHoleIndex within the first nLen elements of pFirst.
void   adjust_heap(int *pFirst, int nHoleIndex, int nLen, int nValue);

// Finds (and prints) the n smallest elements of an array of nLen elements.
void   get_n_min(int *pArray, int nLen, int n);

// Prints the nLength elements of pArray on one line, separated by spaces.
void    display_array(int *pArray, int nLength);

// Demo driver: fills a small array with pseudo-random values and prints its
// smallest elements via get_n_min.
int main()
{
    const int kSampleCount = 10;   // how many random values to generate
    const int kWanted = 2;         // how many of the smallest values to report
    int samples[kSampleCount];

    // Seed from the wall clock so every run produces a different data set.
    srand(time(NULL));

    for (int idx = 0; idx != kSampleCount; ++idx)
        samples[idx] = rand();

    get_n_min(samples, kSampleCount, kWanted);

    return 0;
}

// Prints the n smallest elements of pArray[0, nLen) in ascending order.
//
// Strategy: keep a max-heap of n candidates. The heap root is the largest of
// the current candidates, so any later element smaller than the root replaces
// it. Each replacement costs O(lg n), giving O(nLen * lg n) in the worst case.
//
// pArray : input values; not modified.
// nLen   : number of elements in pArray.
// n      : how many of the smallest elements to report. Values larger than
//          nLen are clamped to nLen.
//
// Does nothing when pArray is NULL or when nLen or n is not positive. If the
// scratch buffer cannot be allocated, reports it through perror and returns.
void get_n_min(int *pArray, int nLen, int n)
{
    // Nothing to select from, or nothing requested.
    if (NULL == pArray || nLen <= 0 || n <= 0)
        return;

    // Never seed the heap with more elements than the array actually holds.
    if (n > nLen)
        n = nLen;

    int *pTmp = static_cast<int*>(malloc(sizeof(int) * static_cast<std::size_t>(n)));
    if (NULL == pTmp)
    {
        perror("malloc error");
        return;
    }

    // Seed the candidate set with the first n elements and heapify it.
    int i;
    for (i = 0; i < n; ++i)
        pTmp[i] = pArray[i];
    make_heap(pTmp, pTmp + n);

    // Show the initial heap before the remaining elements are considered.
    display_array(pTmp, n);

    // pTmp[0] is the largest candidate; a smaller element evicts it.
    for (; i < nLen; ++i)
    {
        if (pArray[i] < pTmp[0])
            adjust_heap(pTmp, 0, n, pArray[i]);
    }

    // Sort the surviving candidates into ascending order.
    sort_heap(pTmp, pTmp + n);

    std::cout << "the min n elements of the array is:\n";

    // Print the selected elements.
    display_array(pTmp, n);

    free(pTmp);
}

// Inserts the element stored at pLast - 1 into the max-heap formed by the
// elements before it, so that afterwards the whole range [pFirst, pLast) is a
// max-heap.
//
// pFirst : start of the range.
// pLast  : one past the element being inserted.
//
// An empty range is left untouched.
void push_heap(int* pFirst, int* pLast)
{
    // With no elements there is nothing to insert; touching *(pLast - 1)
    // would read and write outside the range.
    if (pLast - pFirst < 1)
        return;

    const int nTopIndex = 0;
    int nHoleIndex = static_cast<int>(pLast - pFirst) - 1;
    const int nValue = pFirst[nHoleIndex];

    // Walk up towards the root, pulling smaller parents down into the hole
    // until a parent that is not smaller than the new value is found.
    while (nHoleIndex > nTopIndex)
    {
        const int nParentIndex = (nHoleIndex - 1) / 2;
        if (!(pFirst[nParentIndex] < nValue))
            break;
        pFirst[nHoleIndex] = pFirst[nParentIndex];
        nHoleIndex = nParentIndex;
    }
    pFirst[nHoleIndex] = nValue;
}

// Removes the root of the max-heap [pFirst, pLast).
//
// The removed element (the largest one, since this is a max-heap) is stored
// at pLast - 1, and the element previously stored there is re-inserted into
// the shortened range [pFirst, pLast - 1) via adjust_heap.
//
// Precondition: [pFirst, pLast) satisfies the heap property.
// An empty range is left untouched.
void pop_heap(int* pFirst, int* pLast)
{
    // No root to remove; *(pLast - 1) would be outside the range.
    if (pLast - pFirst < 1)
        return;

    int* const pBack = pLast - 1;

    // Save the last element, then park the current maximum in its slot.
    const int nDisplaced = *pBack;
    *pBack = *pFirst;

    // Refill the hole left at the root using the saved element.
    adjust_heap(pFirst, 0, static_cast<int>(pBack - pFirst), nDisplaced);
}

// Rearranges the elements of [pFirst, pLast) so that they form a max-heap.
//
// Every node that has at least one child is sifted into place with
// adjust_heap, starting from the last such node and finishing at the root.
// Ranges with fewer than two elements are already heaps and are not touched.
void make_heap(int* pFirst, int* pLast)
{
    const int nLen = static_cast<int>(pLast - pFirst);

    // An empty range must not be touched at all: index 0 does not exist.
    // A single element is trivially a heap.
    if (nLen < 2)
        return;

    // (nLen - 2) / 2 is the parent of the last element, i.e. the last node
    // that has a child; nodes after it are leaves and need no adjustment.
    for (int nParentIndex = (nLen - 2) / 2; nParentIndex >= 0; --nParentIndex)
        adjust_heap(pFirst, nParentIndex, nLen, pFirst[nParentIndex]);
}

// Sorts a max-heap into ascending order.
// The result is only correctly sorted if [pFirst, pLast) satisfies the heap
// property on entry.
void sort_heap(int* pFirst, int* pLast)
{
    // Each pop moves the largest remaining element to the back of the
    // still-unsorted prefix, which then shrinks by one.
    for (int* pEnd = pLast; pEnd - pFirst > 1; --pEnd)
        pop_heap(pFirst, pEnd);
}

// Checks whether [pFirst, pLast) satisfies the max-heap property.
// Returns 1 if no element is greater than its parent, 0 otherwise.
char is_heap(int* pFirst, int* pLast)
{
    const int nCount = (int)(pLast - pFirst);

    // Compare every non-root element with its parent, located at (child - 1) / 2.
    for (int nChild = 1; nChild < nCount; ++nChild)
    {
        const int nParent = (nChild - 1) / 2;
        if (pFirst[nParent] < pFirst[nChild])
            return 0;
    }

    return 1;
}

// File-local helper used only by adjust_heap.
// Moves the hole at nHoleIndex upwards, but never above nTopIndex, pulling
// down each parent that is smaller than nValue, then stores nValue in the
// final hole position.
static void push_heap(int *pFirst, int nHoleIndex, int nTopIndex, int nValue)
{
    while (nHoleIndex > nTopIndex)
    {
        const int nAbove = (nHoleIndex - 1) / 2;

        // Stop as soon as the parent is not smaller than the value to place.
        if (!(pFirst[nAbove] < nValue))
            break;

        pFirst[nHoleIndex] = pFirst[nAbove];
        nHoleIndex = nAbove;
    }
    pFirst[nHoleIndex] = nValue;
}

// Re-establishes the heap property below a hole.
//
// pFirst     : start of the heap storage.
// nHoleIndex : index of the node that currently has no valid value.
// nLen       : number of elements taking part in the adjustment.
// nValue     : value that has to be placed into the heap.
//
// The hole is first pushed down to the bottom by repeatedly promoting the
// larger child, then nValue is pushed back up from there, no higher than the
// hole's starting position.
void adjust_heap(int *pFirst, int nHoleIndex, int nLen, int nValue)
{
    const int nStartIndex = nHoleIndex;
    int nRight = 2 * nHoleIndex + 2;

    // While the hole has two children, promote the larger one into it.
    for (; nRight < nLen; nRight = 2 * nHoleIndex + 2)
    {
        const int nLeft = nRight - 1;
        const int nLarger = (pFirst[nRight] < pFirst[nLeft]) ? nLeft : nRight;
        pFirst[nHoleIndex] = pFirst[nLarger];
        nHoleIndex = nLarger;
    }

    // The hole has only a left child: promote it.
    if (nRight == nLen)
    {
        const int nOnlyChild = nRight - 1;
        pFirst[nHoleIndex] = pFirst[nOnlyChild];
        nHoleIndex = nOnlyChild;
    }

    // The author's note (citing p.178 of "STL源码剖析" by JJHOU) states that
    // a plain assignment would have the same effect in this function:
    //pFirst[nHoleIndex] = nValue;
    push_heap(pFirst, nHoleIndex, nStartIndex, nValue);
}

// Writes the first nLength elements of pArray to standard output on a single
// line, each followed by one space, then ends the line and flushes.
void    display_array(int *pArray, int nLength)
{
    int nPos = 0;
    while (nPos < nLength)
    {
        std::cout << pArray[nPos] << " ";
        ++nPos;
    }
    std::cout << std::endl;
}

posted on 2007-11-26 18:54 那谁阅读(4231) 评论(2) 编辑收藏引用所属分类: 算法与数据结构

# re: [算法]找出n个中最大的m个数回复 更多评论

不错，我所知道的另外一个算法，也是N*lg(M),似乎这就是最快了的吧。

2007-11-27 13:36 | tangl_99

# re: [算法]找出n个中最大的m个数 回复 更多评论

只是找10个数而已，不用这么复杂吧，不如直接用有序链表算了，构建和维护成本比堆的成本小多了。要是要找的数量偏大，用上堆还差不多。

2007-11-27 16:39 | www.helpsoff.com.cn

刷新评论列表

只有注册用户登录后才能发表评论。


相关文章: [算法]如何根据数据的多种属性来查找数据 Btree算法实现代码二分查找学习札记把二分查找算法写正确需要注意的地方在一个有序序列中查找重复/不存在的数自己实现的memcpy 另类的链表数据结构以及算法 memcached内存管理算法二分查找算法(迭代和递归版本) ccache发布0.5版本

网站导航: 博客园博客园最新博文博问管理

# re: [算法]找出n个中最大的m个数回复 更多评论

# re: [算法]找出n个中最大的m个数 回复 更多评论

那谁的技术博客