-
Notifications
You must be signed in to change notification settings - Fork 5
/
generatepermutations.c
113 lines (101 loc) · 3.74 KB
/
generatepermutations.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#include <stdio.h>
#include <stdlib.h>
#include "permutations.h"
// A very important part of the audiofingerprinting process is the MinHash algorithm
// used to shorten the many signatures generated for each audio input. This process
// relies on the use of multiple permutations that define a way of characterizing the
// data.
//
// In real-life applications, optimizing the way to choose these permutations can
// make a real difference performance-wise as explained in:
// https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/34409.pdf
//
// However, at a smaller scale, just using random permutations is good enough. But
// since we are generating random permutations, we need to make sure that we generate the
// same permutations at index time and retrieval time otherwise we have no chance to ever
// match anything. Using a constant seed guarantees that the same permutations will be
// generated on all runs.
//
// However, this process is deterministic when running multiple times on the machine, but this
// may not be the case when using different machines. This is why this program is used to
// generate the content of the permutations.c file used to compile the main program, to guarantee
// that indexing and retrieval will use the same permutations.
//
// If you want to regenerate this file (maybe with your own seed), here is how to do it:
//
// $ make genperm
// $ ./genperm > permutations.c
// $ make
//
#define PERMUTATION_SEED 678233
static uint16_t permutations[N_PERMUTATIONS][PERMUTATION_LENGTH];
/**
* Shuffles the array with Knuth shuffle.
*
* @param data An array of size 8192
*/
static void shuffle(uint16_t* data) {
for (unsigned int i = 0 ; i < 8190 ; i++) {
// For each element #i, let's choose an element #j where j is between
// i and the last element of the array, then let's swap them
unsigned int j = i + rand() % (8192 - i);
uint16_t tmp = data[i];
data[i] = data[j];
data[j] = tmp;
}
}
/**
* Populates the given array with a permutation of the integers between 0 and 8191.
* @param temp An array large enough to hold 8192 values
*/
static void create_permutation(uint16_t* temp) {
// First, let's populate the array
for (unsigned int i = 0 ; i < 8192 ; i++) {
temp[i] = i;
}
// Then let's shuffle it
shuffle(temp);
}
/**
* Generates random permutations.
* @param seed The random generator seed so that we get deterministic results
*/
int main() {
uint16_t temp[8192];
srand(PERMUTATION_SEED);
for (unsigned int i = 0 ; i < N_PERMUTATIONS ; i++) {
create_permutation(temp);
for (unsigned int j = 0 ; j < PERMUTATION_LENGTH ; j++) {
permutations[i][j] = temp[j];
}
}
printf("/**\n");
printf(" * This file was generated by compiling and running generatepermutations.c\n");
printf(" */\n");
printf("\n");
printf("#include \"permutations.h\"\n");
printf("\n");
printf("static uint16_t permutations[N_PERMUTATIONS][PERMUTATION_LENGTH] = {\n");
for (unsigned int i = 0 ; i < N_PERMUTATIONS ; i++) {
printf(" {\n ");
for (unsigned int j = 0 ; j < PERMUTATION_LENGTH ; j++) {
printf("%4d", permutations[i][j]);
if (j == PERMUTATION_LENGTH - 1) {
printf("\n");
} else if (((j + 1) % 16) == 0) {
printf(",\n ");
} else {
printf(", ");
}
}
printf(" }%s\n", (i == N_PERMUTATIONS - 1) ? "" : ",");
}
printf("};\n");
printf("\n");
printf("\n");
printf("uint16_t* get_permutation(unsigned int n) {\n");
printf(" return &(permutations[n][0]);\n");
printf("}\n");
printf("\n");
return 0;
}