0%

External sort

Comparison sorts

Name Best Average Worst Memory Stable
Bubble sort \(n\) \(n^2\) \(n^2\) \(1\) Yes
Insertion sort \(n\) \(n^2\) \(n^2\) \(1\) Yes
Merge sort \(n\log n\) \(n\log n\) \(n\log n\) \(n\) Yes
Heapsort \(n\log n\) \(n\log n\) \(n\log n\) \(1\) No
Quicksort \(n\log n\) \(n\log n\) \(n^2\) \(\log n\) No

External sort

External sorting

external.cpp:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#include <bits/stdc++.h>
#define DBG(x) cerr << #x << " = " << x << endl

using namespace std;
typedef long long LL;

const int MAX_MEM = 128 * 1024 * 1024; // 128MB
const int BUF_SIZE = 64;

string write_temp_file(int file_no, const vector<string> &strings) {
string temp_file = to_string(file_no) + ".txt";
FILE *f_temp = fopen(temp_file.c_str(), "w");

for (const auto &s : strings) {
fputs(s.c_str(), f_temp);
}
fclose(f_temp);

return temp_file;
}

vector<string> sort_part_to_disk(const string &input_file) {
vector<string> temp_files;

FILE *f_input = fopen(input_file.c_str(), "r");

int temp_size = 0;
int file_no = 0;
char buf[BUF_SIZE];
vector<string> strings;

while (fgets(buf, sizeof(buf) - 1, f_input) != NULL) {
string str = buf;
temp_size += str.size();
strings.push_back(str);

if (temp_size >= MAX_MEM) {
sort(strings.begin(), strings.end());
string temp_file = write_temp_file(file_no, strings);
temp_files.push_back(temp_file);

file_no++;
strings.clear();
temp_size = 0;
}
}

if (!strings.empty()) {
sort(strings.begin(), strings.end());
string temp_file = write_temp_file(file_no, strings);
temp_files.push_back(temp_file);
}

fclose(f_input);

return temp_files;
}

void external_sort(const string &input_file, const string &output_file) {
vector<string> temp_files = sort_part_to_disk(input_file);
vector<FILE *> files;
for (const auto &file_name : temp_files) {
FILE *temp_file = fopen(file_name.c_str(), "r");
files.push_back(temp_file);
}

FILE *f_output = fopen(output_file.c_str(), "w");

priority_queue<pair<string, string>, vector<pair<string, string>>, greater<pair<string, string>>> pri_que;

bool have_input = false;
do {
have_input = false;

for (int i = 0; i < temp_files.size(); i++) {
char buf[BUF_SIZE];
if (fgets(buf, sizeof(buf) - 1, files[i]) != NULL) {
have_input = true;
pri_que.push(pair<string, string>(buf, temp_files[i]));
}
}

if (!pri_que.empty()) {
pair<string, string> top = pri_que.top();
pri_que.pop();
fputs(top.first.c_str(), f_output);
}

} while (have_input || !pri_que.empty());

for (FILE *f : files) {
fclose(f);
}
fclose(f_output);
}

int main(int argc, char **argv) {
external_sort("raw.txt", "output.txt");
return 0;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
 torapture@localhost ~/SSD/Codes/acm ls -lh raw.txt
-rwxrwxrwx 1 torapture staff 685M 3 30 18:02 raw.txt

torapture@localhost ~/SSD/Codes/acm time ./external
./external 617.04s user 17.50s system 94% cpu 11:10.99 total

torapture@localhost ~/SSD/Codes/acm time sort ./raw.txt > sort.txt
sort ./raw.txt > sort.txt 496.36s user 87.25s system 81% cpu 11:58.01 total

torapture@localhost ~/SSD/Codes/acm ls -lh raw.txt output.txt sort.txt
-rwxrwxrwx 1 torapture staff 685M 3 30 18:13 output.txt
-rwxrwxrwx 1 torapture staff 685M 3 30 18:02 raw.txt
-rwxrwxrwx 1 torapture staff 685M 3 30 18:28 sort.txt
torapture@localhost ~/SSD/Codes/acm diff output.txt sort.txt