出现最频繁的K个单词
2022-10-19 14:57:11 # 算法 # 数据结构 #

1. 问题描述

给定一个单词流,单词个数未知的。现在需要设计一种结构,它可以接收一个单词,也可以返回到目前为止,出现次数最多的K个单词。用户会在某些时候要求查询出现次数最多的K个单词,按照出现次数升序,字典序降序的顺序输出。

K通过构造方法传入,K>0

测评链接:https://www.lintcode.com/problem/550/

2. 暴力解法

2.1. 算法分析

  1. 使用list存储每一个单词及其出现次数
  2. 使用哈希表存储每一个单词内容和单词本身的映射。
  3. 对于每次添加的操作,先在哈希表中查找是否存在单词,如果存在则获取其引用,增加其出现从次数,如果不存在,则新建一个单词的引用,出现次数是1,添加到list和哈希表。
  4. 对于每次获取topk操作,先对list按照出现次数从大到小,字典序从大到小排序,取前K个单词即可。

2.2. 代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import java.util.*;

public class KFrequentWords {

private final int k;

private ArrayList<WordInfo> data;

private HashMap<String,WordInfo> wordMap;

private final Comparator<WordInfo> cmp = (a,b) -> a.cnt != b.cnt ? b.cnt - a.cnt : b.word.compareTo(a.word);

public KFrequentWords(int k) {
this.k = k;
data = new ArrayList<>();
wordMap = new HashMap<>();
}

public void add(String word) {
WordInfo wi = wordMap.get(word);
if(wi == null) {
wi = new WordInfo(word);
wordMap.put(word,wi);
data.add(wi);
}else ++wi.cnt;
}

public List<String> getTopKWords() {
data.sort(cmp);
int end = Math.min(k,data.size()) - 1;
List<String> ans = new ArrayList<>(end + 1);
for(int i = 0; i <= end; ++i)
ans.add(data.get(i).word);
return ans;
}

static class WordInfo {

private int cnt;

private String word;

public WordInfo(String word) {
cnt = 1;
this.word = word;
}
}

public static void main(String[] args) {
KFrequentWords kfw = new KFrequentWords(3);
kfw.add("hello");
kfw.add("yes");
kfw.add("C++");
System.out.println(kfw.getTopKWords());// yes,hello,C++
kfw.add("yes");
kfw.add("yes");
kfw.add("yes");
System.out.println(kfw.getTopKWords());// yes,hello,C++
kfw.add("C++");
kfw.add("C++");
kfw.add("C++");
System.out.println(kfw.getTopKWords());// yes,C++,hello
}

}

3. 最优解

3.1. 算法分析

  1. 还是使用哈希表存储每一个单词内容和其本身的映射;

  2. 使用一个长度为K的小根堆结构,维护出现次数最多且字典序最大的K个单词;还需要一个哈希表,告诉一个元素是否在堆上;

  3. 当新来一个单词的时候,如果不在map中那么新建出来,加入map,否则当前单词的出现次数++;

    • 如果当前单词在堆上,那么调整堆(此时只需要heapify,因为出现次数增加,又是小根堆)
    • 如果当前单词不在堆上,
      • 如果堆size没有达到K,那么加入堆中,调整堆(heapInsert),
      • 如果堆size达到K,然后看看它能否替换堆顶,如果可以则替换堆顶,调整堆
  4. 当需要获取topK的时候,只需要取出堆中的所有元素,排序即可。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import java.util.*;

public class KFrequentWords2 {

private final int k;

private final WordInfo[] heap, help;

private final HashMap<String,WordInfo> wordMap;

private final HashMap<WordInfo,Integer> indexMap;

private int size;

private final Comparator<WordInfo> heapCmp = (a,b) -> a.cnt != b.cnt ? a.cnt - b.cnt : a.word.compareTo(b.word);

private final Comparator<WordInfo> sortCmp = (a,b) -> a.cnt != b.cnt ? b.cnt - a.cnt : b.word.compareTo(a.word);

public KFrequentWords2(int k) {
this.k = k;
heap = new WordInfo[k];
help = new WordInfo[k];
wordMap = new HashMap<>();
indexMap = new HashMap<>();
}

public void add(String word) {
WordInfo wi = wordMap.get(word);
if(wi == null) {// 不在Map则新建
wi = new WordInfo(word);
wordMap.put(word,wi);
}else ++wi.cnt;
Integer index = indexMap.get(wi);
if(index == null) {// 不在堆上
if(size < k) {// 堆不满,则放入堆中
heap[size] = wi;
indexMap.put(wi,size);
heapInsert(size++);
}else {// 堆满,则看看能否替换堆顶
if(heapCmp.compare(heap[0],wi) < 0) {
indexMap.put(wi,0);
indexMap.remove(heap[0]);
heap[0] = wi;
heapIfy(0);
}
}
}else heapIfy(index); // 在堆上,则调整堆
}

public List<String> getTopKWords() {
int len = Math.min(k,size);
List<String> ans = new ArrayList<>(len);
for(int i = 0; i < len; ++i) help[i] = heap[i];
Arrays.sort(help,0,len,sortCmp);
for(int i = 0; i < len; ++i) ans.add(help[i].word);
return ans;
}

private void heapInsert(int index) {
int parent;
while(heapCmp.compare(heap[index],heap[parent = (index - 1) / 2]) < 0) {
swap(index,parent);
index = parent;
}
}

private void heapIfy(int index) {
int child, bnd = size >> 1;
while(index < bnd) {
child = (index << 1) | 1;
if(child + 1 < size && heapCmp.compare(heap[child],heap[child+1]) > 0) ++child;
if(heapCmp.compare(heap[index],heap[child]) <= 0) break;
swap(index,child);
index = child;
}
}

private void swap(int i, int j) {
WordInfo wi = heap[i], wj = heap[j];
indexMap.put(wi,j);
indexMap.put(wj,i);
heap[i] = wj;
heap[j] = wi;
}

static class WordInfo {

private String word;

private int cnt;

public WordInfo(String word) {
this.word = word;
cnt = 1;
}
}

public static void main(String[] args) {
KFrequentWords2 kfw = new KFrequentWords2(3);
kfw.add("hello");
kfw.add("yes");
kfw.add("C++");
System.out.println(kfw.getTopKWords());// yes,hello,C++
kfw.add("yes");
kfw.add("yes");
kfw.add("yes");
System.out.println(kfw.getTopKWords());// yes,hello,C++
kfw.add("C++");
kfw.add("C++");
kfw.add("C++");
System.out.println(kfw.getTopKWords());// yes,C++,hello
}
}

4. 测试程序

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import java.util.*;

public class KFrequentWordsTest {

public static void main(String[] args) {
int testTimes = 100_0000, maxWordCnt = 20,maxK = 5;
while(testTimes-- > 0) {// 测试testTimes次
Random r = new Random();
int k = r.nextInt(maxK) + 1;
int wc = r.nextInt(maxWordCnt) + 1;
String[] words = new String[wc];
for(int i = 0; i < wc; ++i) words[i] = getWord(r);// 生成wc个单词
KFrequentWords2 kfw2 = new KFrequentWords2(k);
KFrequentWords kfw = new KFrequentWords(k);
for(int i = 0; i < wc; ++i) {
kfw2.add(words[i]);
kfw.add(words[i]);
if(r.nextInt(2) == 0) {// 一半的概率获取topK
List<String> ans2 = kfw2.getTopKWords(), ans = kfw.getTopKWords();
if(ans2.size() != ans.size()) {
System.out.println("Oops!! ans2.size:" + ans2.size() + " ans.size:" + ans.size());
System.out.println("All words:");
for(int j = 0; j < wc; ++j) {
System.out.print(words[j]);
System.out.print(" ");
}
System.out.println();
System.out.println("ans2:");
for(int j = 0; j < ans2.size(); ++j) {
System.out.print(ans2.get(j));
System.out.print(" ");
}
System.out.println();
System.out.println("ans:");
for(int j = 0; j < ans.size(); ++j) {
System.out.print(ans.get(j));
System.out.print(" ");
}
System.out.println();
return;
}else {
int ansSize = ans.size();
for(int kk = 0; kk < ansSize; ++kk) {
if(!ans.get(kk).equals(ans2.get(kk))) {
System.out.println("Oops!! Not equals at:" + kk + "ans[" + kk +"]:" + ans.get(kk) + ", ans2[" + kk + "]:" + ans2.get(kk));

System.out.println("All words:");
for(int j = 0; j < wc; ++j) {
System.out.print(words[j]);
System.out.print(" ");
}
System.out.println();
System.out.println("ans2:");
for(int j = 0; j < ans2.size(); ++j) {
System.out.print(ans2.get(j));
System.out.print(" ");
}
System.out.println();
System.out.println("ans:");
for(int j = 0; j < ans.size(); ++j) {
System.out.print(ans.get(j));
System.out.print(" ");
}
System.out.println();
return;
}
}
}
}
}
}
System.out.println("Done successfully!");
}

private static String getWord(Random r) {
int len = r.nextInt(20) + 1;
StringBuilder sb = new StringBuilder();
for(int i = 0; i < len; ++i) sb.append((char)('a'+r.nextInt(26)));
return sb.toString();
}
}