FastDeploy  latest
Fast & Easy to Deploy!
float16.h
1 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #pragma once
16 
#include <stdint.h>

#include <cmath>
#include <iostream>
#include <limits>
#include <type_traits>
22 
23 #if !defined(_WIN32)
24 #define FD_ALIGN(x) __attribute__((aligned(x)))
25 #else
26 #define FD_ALIGN(x) __declspec(align(x))
27 #endif
28 
29 namespace fastdeploy {
30 
// Half-precision (binary16) floating point value kept as its raw 16-bit
// pattern in `x`.
//
// Conversions to and from float pick one of three implementations at compile
// time: ARM NEON intrinsics (FD_WITH_NATIVE_FP16), x86 F16C intrinsics
// (__F16C__), or a portable bit-manipulation fallback.
struct alignas(2) float16 {
 public:
  // Raw bit pattern of the half-precision value.
  uint16_t x;

  // The following defaulted special class member functions
  // are added to make float16 pass the std::is_trivial test
  float16() = default;
  float16(const float16& o) = default;
  float16& operator=(const float16& o) = default;
  float16(float16&& o) = default;
  float16& operator=(float16&& o) = default;
  ~float16() = default;

  // Constructors

#ifdef FD_WITH_NATIVE_FP16
  // __fp16 is a native half precision data type for arm cpu,
  // float16_t is an alias for __fp16
  inline explicit float16(const float16_t& h) {
    x = *reinterpret_cast<const uint16_t*>(&h);
  }
#endif

  // Converts a single-precision value to half precision.
  inline explicit float16(float val) {
#if defined(FD_WITH_NATIVE_FP16)
    float32x4_t tmp = vld1q_dup_f32(&val);
    float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0);
    x = *reinterpret_cast<uint16_t*>(&res);

#elif defined(__F16C__)
    x = _cvtss_sh(val, 0);

#else
    // Conversion routine adapted from
    // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
    Bits v;
    v.f = val;
    uint32_t sign = v.si & sigN;
    v.si ^= sign;        // work on the magnitude only
    sign >>= shiftSign;  // logical shift
    if (v.si < minN) {
      // Magnitude is below the smallest normal half: rescale it so that the
      // truncated integer is the subnormal bit pattern (pre-shift). The
      // guard keeps the float -> int conversion in range; performing it for
      // larger magnitudes would be undefined behaviour.
      Bits s;
      s.si = mulN;
      s.si = static_cast<int32_t>(s.f * v.f);  // correct subnormals
      v.si = s.si;
    }
    // Finite values above the largest half become infinity.
    v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN));
    // NaNs whose payload would vanish after the shift become the smallest
    // NaN that survives it.
    v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN));
    v.ui >>= shift;  // logical shift
    // Re-bias the exponent for the inf/NaN range and for the normal range.
    v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC);
    v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC);
    x = v.ui | sign;

#endif
  }

  // true maps to 1.0 (0x3c00), false to +0.
  inline explicit float16(bool b) : x(b ? 0x3c00 : 0) {}

  // Any other type is first converted to float.
  template <class T>
  inline explicit float16(const T& val)
      : x(float16(static_cast<float>(val)).x) {}

  // Assignment operators

#ifdef FD_WITH_NATIVE_FP16
  inline float16& operator=(const float16_t& rhs) {
    x = *reinterpret_cast<const uint16_t*>(&rhs);
    return *this;
  }
#endif

  inline float16& operator=(bool b) {
    x = b ? 0x3c00 : 0;
    return *this;
  }

  inline float16& operator=(int8_t val) {
    x = float16(val).x;
    return *this;
  }

  inline float16& operator=(uint8_t val) {
    x = float16(val).x;
    return *this;
  }

  inline float16& operator=(int16_t val) {
    x = float16(val).x;
    return *this;
  }

  inline float16& operator=(uint16_t val) {
    x = float16(val).x;
    return *this;
  }

  inline float16& operator=(int32_t val) {
    x = float16(val).x;
    return *this;
  }

  inline float16& operator=(uint32_t val) {
    x = float16(val).x;
    return *this;
  }

  inline float16& operator=(int64_t val) {
    x = float16(val).x;
    return *this;
  }

  inline float16& operator=(uint64_t val) {
    x = float16(val).x;
    return *this;
  }

  inline float16& operator=(float val) {
    x = float16(val).x;
    return *this;
  }

  inline float16& operator=(double val) {
    x = float16(val).x;
    return *this;
  }

  // Conversion operators
#ifdef FD_WITH_NATIVE_FP16
  inline explicit operator float16_t() const {
    return *reinterpret_cast<const float16_t*>(this);
  }
#endif

  // Converts the stored half-precision value to single precision.
  inline operator float() const {
#if defined(FD_WITH_NATIVE_FP16)
    float16x4_t res = vld1_dup_f16(reinterpret_cast<const float16_t*>(this));
    return vgetq_lane_f32(vcvt_f32_f16(res), 0);

#elif defined(__F16C__)
    return _cvtsh_ss(this->x);

#else
    // Conversion routine adapted from
    // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
    Bits v;
    v.ui = this->x;
    int32_t sign = v.si & sigC;
    v.si ^= sign;  // work on the magnitude only
    sign <<= shiftSign;
    // Re-bias the exponent for the normal range and for the inf/NaN range.
    v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC);
    v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC);
    // Value to use when the input is subnormal: the integer magnitude scaled
    // by mulC.
    Bits s;
    s.si = mulC;
    s.f *= v.si;
    int32_t mask = -(norC > v.si);  // all ones for subnormal input
    v.si <<= shift;
    v.si ^= (s.si ^ v.si) & mask;
    v.si |= sign;
    return v.f;

#endif
  }

  // False only for +0 and -0 (the sign bit is ignored).
  inline explicit operator bool() const { return (x & 0x7fff) != 0; }

  inline explicit operator int8_t() const {
    return static_cast<int8_t>(static_cast<float>(*this));
  }

  inline explicit operator uint8_t() const {
    return static_cast<uint8_t>(static_cast<float>(*this));
  }

  inline explicit operator int16_t() const {
    return static_cast<int16_t>(static_cast<float>(*this));
  }

  inline explicit operator uint16_t() const {
    return static_cast<uint16_t>(static_cast<float>(*this));
  }

  inline explicit operator int32_t() const {
    return static_cast<int32_t>(static_cast<float>(*this));
  }

  inline explicit operator uint32_t() const {
    return static_cast<uint32_t>(static_cast<float>(*this));
  }

  inline explicit operator int64_t() const {
    return static_cast<int64_t>(static_cast<float>(*this));
  }

  inline explicit operator uint64_t() const {
    return static_cast<uint64_t>(static_cast<float>(*this));
  }

  inline operator double() const {
    return static_cast<double>(static_cast<float>(*this));
  }

  // Comparisons against float / double are carried out in the wider type.
  inline bool operator>(const float& other) const {
    return this->operator float() > other;
  }

  inline bool operator>(const double& other) const {
    return this->operator double() > other;
  }

  inline bool operator<(const float& other) const {
    return this->operator float() < other;
  }

  inline bool operator<(const double& other) const {
    return this->operator double() < other;
  }

  // Adds a non-float16 operand: the sum is computed in T and converted back
  // to float16.
  template <typename T,
            typename std::enable_if<!std::is_same<T, float16>::value,
                                    bool>::type = true>
  inline float16& operator+=(const T& other) {
    *this = float16(static_cast<T>(*this) + other);
    return *this;
  }

 private:
  // Views the same 32 bits as a float, a signed or an unsigned integer.
  union Bits {
    float f;
    int32_t si;
    uint32_t ui;
  };

  // Shift between float and half bit positions: 13 for the
  // exponent/mantissa field, 16 for the sign bit.
  static const int shift = 13;
  static const int shiftSign = 16;

  static const int32_t infN = 0x7F800000;
  static const int32_t maxN = 0x477FE000;  // max flt16 as flt32
  static const int32_t minN = 0x38800000;  // min flt16 normal as flt32
  static const int32_t sigN = static_cast<int32_t>(0x80000000u);  // sign bit

  static constexpr int32_t infC = infN >> shift;
  static constexpr int32_t nanN = (infC + 1)
                                  << shift;  // minimum flt16 nan as float32
  static constexpr int32_t maxC = maxN >> shift;
  static constexpr int32_t minC = minN >> shift;
  static constexpr int32_t sigC = sigN >> shiftSign;

  static const int32_t mulN = 0x52000000;  // (1 << 23) / minN
  static const int32_t mulC = 0x33800000;  // minN / (1 << (23 - shift))
  static const int32_t subC = 0x003FF;     // max flt32 subnormal downshifted
  static const int32_t norC = 0x00400;     // min flt32 normal downshifted

  static constexpr int32_t maxD = infC - maxC - 1;
  static constexpr int32_t minD = minC - subC - 1;
};
283 
284 // Arithmetic operators for float16 on ARMv8.2-A CPU
285 #if defined(FD_WITH_NATIVE_FP16)
286 inline float16 operator+(const float16& a, const float16& b) {
287  float16 res;
288  asm volatile(
289  "ld1 {v0.h}[0], [%[a_ptr]]\n"
290  "ld1 {v1.h}[0], [%[b_ptr]]\n"
291  "fadd h0, h0, h1\n"
292  "st1 {v0.h}[0], [%[res_ptr]]\n"
293  : // outputs
294  : // inputs
295  [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
296  [res_ptr] "r"(&(res.x))
297  : // clobbers
298  "memory", "v0", "v1");
299  return res;
300 }
301 
302 inline float16 operator-(const float16& a, const float16& b) {
303  float16 res;
304  asm volatile(
305  "ld1 {v0.h}[0], [%[a_ptr]]\n"
306  "ld1 {v1.h}[0], [%[b_ptr]]\n"
307  "fsub h0, h0, h1\n"
308  "st1 {v0.h}[0], [%[res_ptr]]\n"
309  : // outputs
310  : // inputs
311  [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
312  [res_ptr] "r"(&(res.x))
313  : // clobbers
314  "memory", "v0", "v1");
315  return res;
316 }
317 
318 inline float16 operator*(const float16& a, const float16& b) {
319  float16 res;
320  asm volatile(
321  "ld1 {v0.h}[0], [%[a_ptr]]\n"
322  "ld1 {v1.h}[0], [%[b_ptr]]\n"
323  "fmul h0, h0, h1\n"
324  "st1 {v0.h}[0], [%[res_ptr]]\n"
325  : // outputs
326  : // inputs
327  [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
328  [res_ptr] "r"(&(res.x))
329  : // clobbers
330  "memory", "v0", "v1");
331  return res;
332 }
333 
334 inline float16 operator/(const float16& a, const float16& b) {
335  float16 res;
336  asm volatile(
337  "ld1 {v0.h}[0], [%[a_ptr]]\n"
338  "ld1 {v1.h}[0], [%[b_ptr]]\n"
339  "fdiv h0, h0, h1\n"
340  "st1 {v0.h}[0], [%[res_ptr]]\n"
341  : // outputs
342  : // inputs
343  [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
344  [res_ptr] "r"(&(res.x))
345  : // clobbers
346  "memory", "v0", "v1");
347  return res;
348 }
349 
350 inline float16 operator-(const float16& a) {
351  float16 res;
352  asm volatile(
353  "ld1 {v0.h}[0], [%[a_ptr]]\n"
354  "fneg h0, h0\n"
355  "st1 {v0.h}[0], [%[res_ptr]]\n"
356  : // outputs
357  : // inputs
358  [a_ptr] "r"(&(a.x)),
359  [res_ptr] "r"(&(res.x))
360  : // clobbers
361  "memory", "v0");
362  return res;
363 }
364 
365 inline float16& operator+=(float16& a, const float16& b) { // NOLINT
366  a = a + b;
367  return a;
368 }
369 
370 inline float16& operator-=(float16& a, const float16& b) { // NOLINT
371  a = a - b;
372  return a;
373 }
374 
375 inline float16& operator*=(float16& a, const float16& b) { // NOLINT
376  a = a * b;
377  return a;
378 }
379 
380 inline float16& operator/=(float16& a, const float16& b) { // NOLINT
381  a = a / b;
382  return a;
383 }
384 
385 inline bool operator==(const float16& a, const float16& b) {
386  uint16_t res;
387  asm volatile(
388  "ld1 {v0.h}[0], [%[a_ptr]]\n"
389  "ld1 {v1.h}[0], [%[b_ptr]]\n"
390  "fcmeq h0, h0, h1\n"
391  "st1 {v0.h}[0], [%[res_ptr]]\n"
392  : // outputs
393  : // inputs
394  [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
395  [res_ptr] "r"(&res)
396  : // clobbers
397  "memory", "v0", "v1");
398  return (res & 0xffff) != 0;
399 }
400 
401 inline bool operator!=(const float16& a, const float16& b) { return !(a == b); }
402 
403 inline bool operator<(const float16& a, const float16& b) {
404  uint16_t res;
405  asm volatile(
406  "ld1 {v1.h}[0], [%[a_ptr]]\n"
407  "ld1 {v0.h}[0], [%[b_ptr]]\n"
408  "fcmgt h0, h0, h1\n"
409  "st1 {v0.h}[0], [%[res_ptr]]\n"
410  : // outputs
411  : // inputs
412  [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
413  [res_ptr] "r"(&res)
414  : // clobbers
415  "memory", "v0", "v1");
416  return (res & 0xffff) != 0;
417 }
418 
419 inline bool operator<=(const float16& a, const float16& b) {
420  uint16_t res;
421  asm volatile(
422  "ld1 {v1.h}[0], [%[a_ptr]]\n"
423  "ld1 {v0.h}[0], [%[b_ptr]]\n"
424  "fcmge h0, h0, h1\n"
425  "st1 {v0.h}[0], [%[res_ptr]]\n"
426  : // outputs
427  : // inputs
428  [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
429  [res_ptr] "r"(&res)
430  : // clobbers
431  "memory", "v0", "v1");
432  return (res & 0xffff) != 0;
433 }
434 
435 inline bool operator>(const float16& a, const float16& b) {
436  uint16_t res;
437  asm volatile(
438  "ld1 {v0.h}[0], [%[a_ptr]]\n"
439  "ld1 {v1.h}[0], [%[b_ptr]]\n"
440  "fcmgt h0, h0, h1\n"
441  "st1 {v0.h}[0], [%[res_ptr]]\n"
442  : // outputs
443  : // inputs
444  [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
445  [res_ptr] "r"(&res)
446  : // clobbers
447  "memory", "v0", "v1");
448  return (res & 0xffff) != 0;
449 }
450 
451 inline bool operator>=(const float16& a, const float16& b) {
452  uint16_t res;
453  asm volatile(
454  "ld1 {v0.h}[0], [%[a_ptr]]\n"
455  "ld1 {v1.h}[0], [%[b_ptr]]\n"
456  "fcmge h0, h0, h1\n"
457  "st1 {v0.h}[0], [%[res_ptr]]\n"
458  : // outputs
459  : // inputs
460  [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
461  [res_ptr] "r"(&res)
462  : // clobbers
463  "memory", "v0", "v1");
464  return (res & 0xffff) != 0;
465 #else
466 inline float16 operator+(const float16& a, const float16& b) {
467  return float16(static_cast<float>(a) + static_cast<float>(b));
468 }
469 
470 inline float16 operator-(const float16& a, const float16& b) {
471  return float16(static_cast<float>(a) - static_cast<float>(b));
472 }
473 
474 inline float16 operator*(const float16& a, const float16& b) {
475  return float16(static_cast<float>(a) * static_cast<float>(b));
476 }
477 
478 inline float16 operator/(const float16& a, const float16& b) {
479  return float16(static_cast<float>(a) / static_cast<float>(b));
480 }
481 
482 inline float16 operator-(const float16& a) {
483  float16 res;
484  res.x = a.x ^ 0x8000;
485  return res;
486 }
487 
488 inline float16& operator+=(float16& a, const float16& b) { // NOLINT
489  a = float16(static_cast<float>(a) + static_cast<float>(b));
490  return a;
491 }
492 
493 inline float16& operator-=(float16& a, const float16& b) { // NOLINT
494  a = float16(static_cast<float>(a) - static_cast<float>(b));
495  return a;
496 }
497 
498 inline float16& operator*=(float16& a, const float16& b) { // NOLINT
499  a = float16(static_cast<float>(a) * static_cast<float>(b));
500  return a;
501 }
502 
503 inline float16& operator/=(float16& a, const float16& b) { // NOLINT
504  a = float16(static_cast<float>(a) / static_cast<float>(b));
505  return a;
506 }
507 
508 inline bool operator==(const float16& a, const float16& b) {
509  return static_cast<float>(a) == static_cast<float>(b);
510 }
511 
512 inline bool operator!=(const float16& a, const float16& b) {
513  return static_cast<float>(a) != static_cast<float>(b);
514 }
515 
516 inline bool operator<(const float16& a, const float16& b) {
517  return static_cast<float>(a) < static_cast<float>(b);
518 }
519 
520 inline bool operator<=(const float16& a, const float16& b) {
521  return static_cast<float>(a) <= static_cast<float>(b);
522 }
523 
524 inline bool operator>(const float16& a, const float16& b) {
525  return static_cast<float>(a) > static_cast<float>(b);
526 }
527 
528 inline bool operator>=(const float16& a, const float16& b) {
529  return static_cast<float>(a) >= static_cast<float>(b);
530 }
531 #endif
532 
533  template <typename T,
534  typename std::enable_if<std::is_integral<T>::value ||
535  std::is_same<T, float>::value,
536  bool>::type = true>
537  inline T& operator+=(T& a, const float16& b) { // NOLINT
538  auto c = static_cast<float>(a) + static_cast<float>(b);
539  a = static_cast<T>(c);
540  return a;
541  }
542 
543  inline double& operator+=(double& a, const float16& b) { // NOLINT
544  a = a + static_cast<double>(b);
545  return a;
546  }
547 
548  inline float16 raw_uint16_to_float16(uint16_t a) {
549  float16 res;
550  res.x = a;
551  return res;
552  }
553 
554  inline bool(isnan)(const float16& a) { return (a.x & 0x7fff) > 0x7c00; }
555 
556  inline bool(isinf)(const float16& a) { return (a.x & 0x7fff) == 0x7c00; }
557 
558  inline bool(isfinite)(const float16& a) {
559  return !((isnan)(a)) && !((isinf)(a));
560  }
561 
562  inline float16(abs)(const float16& a) {
563  return float16(std::abs(static_cast<float>(a)));
564  }
565 
566  inline std::ostream& operator<<(std::ostream& os, const float16& a) {
567  os << static_cast<float>(a);
568  return os;
569  }
570 } // namespace fastdeploy
571 
572 namespace std {
573 
574 // Override the std::is_pod::value for float16
575 // The reason is that different compilers implemented std::is_pod based on
576 // different C++ standards. float16 class is a plain old data in C++11 given
577 // that it is both trivial and standard_layout.
578 // However, std::is_pod in nvcc 8.0 host c++ compiler follows C++0x and is
579 // more restricted in that you cannot provide any customized
580 // constructor in float16. Hence, we override is_pod here following C++11
581 // so that .cu files can be successfully compiled by nvcc.
582 template <>
583 struct is_pod<fastdeploy::float16> {
584  static const bool value = is_trivial<fastdeploy::float16>::value &&
585  is_standard_layout<fastdeploy::float16>::value;
586 };
587 
588 template <>
589 struct is_floating_point<fastdeploy::float16>
590  : std::integral_constant<
591  bool, std::is_same<fastdeploy::float16,
592  typename std::remove_cv<
593  fastdeploy::float16>::type>::value> {};
594 template <>
595 struct is_signed<fastdeploy::float16> {
596  static const bool value = true;
597 };
598 
599 template <>
600 struct is_unsigned<fastdeploy::float16> {
601  static const bool value = false;
602 };
603 
604 inline bool isnan(const fastdeploy::float16& a) { return fastdeploy::isnan(a); }
605 
606 inline bool isinf(const fastdeploy::float16& a) { return fastdeploy::isinf(a); }
607 
608 template <>
609 struct numeric_limits<fastdeploy::float16> {
610  static const bool is_specialized = true;
611  static const bool is_signed = true;
612  static const bool is_integer = false;
613  static const bool is_exact = false;
614  static const bool has_infinity = true;
615  static const bool has_quiet_NaN = true;
616  static const bool has_signaling_NaN = true;
617  static const float_denorm_style has_denorm = denorm_present;
618  static const bool has_denorm_loss = false;
619  static const std::float_round_style round_style = std::round_to_nearest;
620  static const bool is_iec559 = false;
621  static const bool is_bounded = false;
622  static const bool is_modulo = false;
623  static const int digits = 11;
624  static const int digits10 = 3;
625  static const int max_digits10 = 5;
626  static const int radix = 2;
627  static const int min_exponent = -13;
628  static const int min_exponent10 = -4;
629  static const int max_exponent = 16;
630  static const int max_exponent10 = 4;
631  static const bool traps = true;
632  static const bool tinyness_before = false;
633 
634  static fastdeploy::float16(min)() {
635  return fastdeploy::raw_uint16_to_float16(0x400);
636  }
637  static fastdeploy::float16 lowest() {
638  return fastdeploy::raw_uint16_to_float16(0xfbff);
639  }
640  static fastdeploy::float16(max)() {
641  return fastdeploy::raw_uint16_to_float16(0x7bff);
642  }
643  static fastdeploy::float16 epsilon() {
644  return fastdeploy::raw_uint16_to_float16(0x0800);
645  }
646  static fastdeploy::float16 round_error() { return fastdeploy::float16(0.5); }
647  static fastdeploy::float16 infinity() {
648  return fastdeploy::raw_uint16_to_float16(0x7c00);
649  }
650  static fastdeploy::float16 quiet_NaN() {
651  return fastdeploy::raw_uint16_to_float16(0x7e00);
652  }
653  static fastdeploy::float16 signaling_NaN() {
654  return fastdeploy::raw_uint16_to_float16(0x7e00);
655  }
656  static fastdeploy::float16 denorm_min() {
657  return fastdeploy::raw_uint16_to_float16(0x1);
658  }
659 };
660 
661 inline fastdeploy::float16 abs(const fastdeploy::float16& a) {
662  return fastdeploy::abs(a);
663 }
664 
665 } // namespace std
Definition: float16.h:572
All C++ FastDeploy APIs are defined inside this namespace.
Definition: option.h:16