]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, software | |
12 | // distributed under the License is distributed on an "AS IS" BASIS, | |
13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | // See the License for the specific language governing permissions and | |
15 | // limitations under the License. | |
16 | ||
17 | package hashing | |
18 | ||
19 | import ( | |
20 | "github.com/apache/arrow/go/v6/arrow/bitutil" | |
21 | "github.com/apache/arrow/go/v6/parquet/internal/utils" | |
22 | ) | |
23 | ||
24 | {{range .In}} | |
25 | type payload{{.Name}} struct { | |
26 | val {{.name}} | |
27 | memoIdx int32 | |
28 | } | |
29 | ||
30 | type entry{{.Name}} struct { | |
31 | h uint64 | |
32 | payload payload{{.Name}} | |
33 | } | |
34 | ||
35 | func (e entry{{.Name}}) Valid() bool { return e.h != sentinel } | |
36 | ||
37 | // {{.Name}}HashTable is a hashtable specifically for {{.name}} that | |
38 | // is utilized with the MemoTable to generalize interactions for easier | |
39 | // implementation of dictionaries without losing performance. | |
40 | type {{.Name}}HashTable struct { | |
41 | cap uint64 | |
42 | capMask uint64 | |
43 | size uint64 | |
44 | ||
45 | entries []entry{{.Name}} | |
46 | } | |
47 | ||
48 | // New{{.Name}}HashTable returns a new hash table for {{.name}} values | |
49 | // initialized with the passed in capacity or 32 whichever is larger. | |
50 | func New{{.Name}}HashTable(cap uint64) *{{.Name}}HashTable { | |
51 | initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) | |
52 | ret := &{{.Name}}HashTable{cap: initCap, capMask: initCap - 1, size: 0} | |
53 | ret.entries = make([]entry{{.Name}}, initCap) | |
54 | return ret | |
55 | } | |
56 | ||
57 | // Reset drops all of the values in this hash table and re-initializes it | |
58 | // with the specified initial capacity as if by calling New, but without having | |
59 | // to reallocate the object. | |
60 | func (h *{{.Name}}HashTable) Reset(cap uint64) { | |
61 | h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) | |
62 | h.capMask = h.cap - 1 | |
63 | h.size = 0 | |
64 | h.entries = make([]entry{{.Name}}, h.cap) | |
65 | } | |
66 | ||
67 | // CopyValues is used for copying the values out of the hash table into the | |
68 | // passed in slice, in the order that they were first inserted | |
69 | func (h *{{.Name}}HashTable) CopyValues(out []{{.name}}) { | |
70 | h.CopyValuesSubset(0, out) | |
71 | } | |
72 | ||
73 | // CopyValuesSubset copies a subset of the values in the hashtable out, starting | |
74 | // with the value at start, in the order that they were inserted. | |
75 | func (h *{{.Name}}HashTable) CopyValuesSubset(start int, out []{{.name}}) { | |
76 | h.VisitEntries(func(e *entry{{.Name}}) { | |
77 | idx := e.payload.memoIdx - int32(start) | |
78 | if idx >= 0 { | |
79 | out[idx] = e.payload.val | |
80 | } | |
81 | }) | |
82 | } | |
83 | ||
84 | func (h *{{.Name}}HashTable) WriteOut(out []byte) { | |
85 | h.WriteOutSubset(0, out) | |
86 | } | |
87 | ||
88 | func (h *{{.Name}}HashTable) WriteOutSubset(start int, out []byte) { | |
89 | data := arrow.{{.Name}}Traits.CastFromBytes(out) | |
90 | h.VisitEntries(func(e *entry{{.Name}}) { | |
91 | idx := e.payload.memoIdx - int32(start) | |
92 | if idx >= 0 { | |
93 | data[idx] = utils.ToLE{{.Name}}(e.payload.val) | |
94 | } | |
95 | }) | |
96 | } | |
97 | ||
98 | func (h *{{.Name}}HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } | |
99 | ||
100 | func ({{.Name}}HashTable) fixHash(v uint64) uint64 { | |
101 | if v == sentinel { | |
102 | return 42 | |
103 | } | |
104 | return v | |
105 | } | |
106 | ||
107 | // Lookup retrieves the entry for a given hash value assuming it's payload value returns | |
108 | // true when passed to the cmp func. Returns a pointer to the entry for the given hash value, | |
109 | // and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. | |
110 | func (h *{{.Name}}HashTable) Lookup(v uint64, cmp func({{.name}}) bool) (*entry{{.Name}}, bool) { | |
111 | idx, ok := h.lookup(v, h.capMask, cmp) | |
112 | return &h.entries[idx], ok | |
113 | } | |
114 | ||
115 | func (h *{{.Name}}HashTable) lookup(v uint64, szMask uint64, cmp func({{.name}}) bool) (uint64, bool) { | |
116 | const perturbShift uint8 = 5 | |
117 | ||
118 | var ( | |
119 | idx uint64 | |
120 | perturb uint64 | |
121 | e *entry{{.Name}} | |
122 | ) | |
123 | ||
124 | v = h.fixHash(v) | |
125 | idx = v & szMask | |
126 | perturb = (v >> uint64(perturbShift)) + 1 | |
127 | ||
128 | for { | |
129 | e = &h.entries[idx] | |
130 | if e.h == v && cmp(e.payload.val) { | |
131 | return idx, true | |
132 | } | |
133 | ||
134 | if e.h == sentinel { | |
135 | return idx, false | |
136 | } | |
137 | ||
138 | // perturbation logic inspired from CPython's set/dict object | |
139 | // the goal is that all 64 bits of unmasked hash value eventually | |
140 | // participate int he probing sequence, to minimize clustering | |
141 | idx = (idx + perturb) & szMask | |
142 | perturb = (perturb >> uint64(perturbShift)) + 1 | |
143 | } | |
144 | } | |
145 | ||
146 | func (h *{{.Name}}HashTable) upsize(newcap uint64) error { | |
147 | newMask := newcap - 1 | |
148 | ||
149 | oldEntries := h.entries | |
150 | h.entries = make([]entry{{.Name}}, newcap) | |
151 | for _, e := range oldEntries { | |
152 | if e.Valid() { | |
153 | idx, _ := h.lookup(e.h, newMask, func({{.name}}) bool { return false }) | |
154 | h.entries[idx] = e | |
155 | } | |
156 | } | |
157 | h.cap = newcap | |
158 | h.capMask = newMask | |
159 | return nil | |
160 | } | |
161 | ||
162 | // Insert updates the given entry with the provided hash value, payload value and memo index. | |
163 | // The entry pointer must have been retrieved via lookup in order to actually insert properly. | |
164 | func (h *{{.Name}}HashTable) Insert(e *entry{{.Name}}, v uint64, val {{.name}}, memoIdx int32) error { | |
165 | e.h = h.fixHash(v) | |
166 | e.payload.val = val | |
167 | e.payload.memoIdx = memoIdx | |
168 | h.size++ | |
169 | ||
170 | if h.needUpsize() { | |
171 | h.upsize(h.cap * uint64(loadFactor) * 2) | |
172 | } | |
173 | return nil | |
174 | } | |
175 | ||
176 | // VisitEntries will call the passed in function on each *valid* entry in the hash table, | |
177 | // a valid entry being one which has had a value inserted into it. | |
178 | func (h *{{.Name}}HashTable) VisitEntries(visit func(*entry{{.Name}})) { | |
179 | for _, e := range h.entries { | |
180 | if e.Valid() { | |
181 | visit(&e) | |
182 | } | |
183 | } | |
184 | } | |
185 | ||
186 | // {{.Name}}MemoTable is a wrapper over the appropriate hashtable to provide an interface | |
187 | // conforming to the MemoTable interface defined in the encoding package for general interactions | |
188 | // regarding dictionaries. | |
189 | type {{.Name}}MemoTable struct { | |
190 | tbl *{{.Name}}HashTable | |
191 | nullIdx int32 | |
192 | } | |
193 | ||
194 | // New{{.Name}}MemoTable returns a new memotable with num entries pre-allocated to reduce further | |
195 | // allocations when inserting. | |
196 | func New{{.Name}}MemoTable(num int64) *{{.Name}}MemoTable { | |
197 | return &{{.Name}}MemoTable{tbl: New{{.Name}}HashTable(uint64(num)), nullIdx: KeyNotFound} | |
198 | } | |
199 | ||
200 | // Reset allows this table to be re-used by dumping all the data currently in the table. | |
201 | func (s *{{.Name}}MemoTable) Reset() { | |
202 | s.tbl.Reset(32) | |
203 | s.nullIdx = KeyNotFound | |
204 | } | |
205 | ||
206 | // Size returns the current number of inserted elements into the table including if a null | |
207 | // has been inserted. | |
208 | func (s *{{.Name}}MemoTable) Size() int { | |
209 | sz := int(s.tbl.size) | |
210 | if _, ok := s.GetNull(); ok { | |
211 | sz++ | |
212 | } | |
213 | return sz | |
214 | } | |
215 | ||
216 | // GetNull returns the index of an inserted null or KeyNotFound along with a bool | |
217 | // that will be true if found and false if not. | |
218 | func (s *{{.Name}}MemoTable) GetNull() (int, bool) { | |
219 | return int(s.nullIdx), s.nullIdx != KeyNotFound | |
220 | } | |
221 | ||
222 | // GetOrInsertNull will return the index of the null entry or insert a null entry | |
223 | // if one currently doesn't exist. The found value will be true if there was already | |
224 | // a null in the table, and false if it inserted one. | |
225 | func (s *{{.Name}}MemoTable) GetOrInsertNull() (idx int, found bool) { | |
226 | idx, found = s.GetNull() | |
227 | if !found { | |
228 | idx = s.Size() | |
229 | s.nullIdx = int32(idx) | |
230 | } | |
231 | return | |
232 | } | |
233 | ||
234 | // CopyValues will copy the values from the memo table out into the passed in slice | |
235 | // which must be of the appropriate type. | |
236 | func (s *{{.Name}}MemoTable) CopyValues(out interface{}) { | |
237 | s.CopyValuesSubset(0, out) | |
238 | } | |
239 | ||
240 | // CopyValuesSubset is like CopyValues but only copies a subset of values starting | |
241 | // at the provided start index | |
242 | func (s *{{.Name}}MemoTable) CopyValuesSubset(start int, out interface{}) { | |
243 | s.tbl.CopyValuesSubset(start, out.([]{{.name}})) | |
244 | } | |
245 | ||
246 | func (s *{{.Name}}MemoTable) WriteOut(out []byte) { | |
247 | s.tbl.WriteOut(out) | |
248 | } | |
249 | ||
250 | func (s *{{.Name}}MemoTable) WriteOutSubset(start int, out []byte) { | |
251 | s.tbl.WriteOutSubset(start, out) | |
252 | } | |
253 | ||
254 | // Get returns the index of the requested value in the hash table or KeyNotFound | |
255 | // along with a boolean indicating if it was found or not. | |
256 | func (s *{{.Name}}MemoTable) Get(val interface{}) (int, bool) { | |
257 | {{if or (eq .Name "Int32") (eq .Name "Int64") }} | |
258 | h := hashInt(uint64(val.({{.name}})), 0) | |
259 | if e, ok := s.tbl.Lookup(h, func(v {{.name}}) bool { return val.({{.name}}) == v }); ok { | |
260 | {{ else -}} | |
261 | var cmp func({{.name}}) bool | |
262 | {{if eq .Name "Float32"}} | |
263 | if math.IsNaN(float64(val.(float32))) { | |
264 | cmp = isNan32Cmp | |
265 | // use consistent internal bit pattern for NaN regardless of the pattern | |
266 | // that is passed to us. NaN is NaN is NaN | |
267 | val = float32(math.NaN()) | |
268 | {{ else -}} | |
269 | if math.IsNaN(val.(float64)) { | |
270 | cmp = math.IsNaN | |
271 | // use consistent internal bit pattern for NaN regardless of the pattern | |
272 | // that is passed to us. NaN is NaN is NaN | |
273 | val = math.NaN() | |
274 | {{end -}} | |
275 | } else { | |
276 | cmp = func(v {{.name}}) bool { return val.({{.name}}) == v } | |
277 | } | |
278 | ||
279 | h := hash{{.Name}}(val.({{.name}}), 0) | |
280 | if e, ok := s.tbl.Lookup(h, cmp); ok { | |
281 | {{ end -}} | |
282 | return int(e.payload.memoIdx), ok | |
283 | } | |
284 | return KeyNotFound, false | |
285 | } | |
286 | ||
287 | // GetOrInsert will return the index of the specified value in the table, or insert the | |
288 | // value into the table and return the new index. found indicates whether or not it already | |
289 | // existed in the table (true) or was inserted by this call (false). | |
290 | func (s *{{.Name}}MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { | |
291 | {{if or (eq .Name "Int32") (eq .Name "Int64") }} | |
292 | h := hashInt(uint64(val.({{.name}})), 0) | |
293 | e, ok := s.tbl.Lookup(h, func(v {{.name}}) bool { | |
294 | return val.({{.name}}) == v | |
295 | }) | |
296 | {{ else }} | |
297 | var cmp func({{.name}}) bool | |
298 | {{if eq .Name "Float32"}} | |
299 | if math.IsNaN(float64(val.(float32))) { | |
300 | cmp = isNan32Cmp | |
301 | // use consistent internal bit pattern for NaN regardless of the pattern | |
302 | // that is passed to us. NaN is NaN is NaN | |
303 | val = float32(math.NaN()) | |
304 | {{ else -}} | |
305 | if math.IsNaN(val.(float64)) { | |
306 | cmp = math.IsNaN | |
307 | // use consistent internal bit pattern for NaN regardless of the pattern | |
308 | // that is passed to us. NaN is NaN is NaN | |
309 | val = math.NaN() | |
310 | {{end -}} | |
311 | } else { | |
312 | cmp = func(v {{.name}}) bool { return val.({{.name}}) == v } | |
313 | } | |
314 | ||
315 | h := hash{{.Name}}(val.({{.name}}), 0) | |
316 | e, ok := s.tbl.Lookup(h, cmp) | |
317 | {{ end }} | |
318 | if ok { | |
319 | idx = int(e.payload.memoIdx) | |
320 | found = true | |
321 | } else { | |
322 | idx = s.Size() | |
323 | s.tbl.Insert(e, h, val.({{.name}}), int32(idx)) | |
324 | } | |
325 | return | |
326 | } | |
327 | {{end}} |