aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/base32.c93
-rw-r--r--src/base32.h10
-rw-r--r--src/common.h27
-rw-r--r--src/hmac.c41
-rw-r--r--src/hmac.h11
-rw-r--r--src/main.c191
-rw-r--r--src/sha1-generic.c78
-rw-r--r--src/sha1-x64.c98
-rw-r--r--src/sha1.c76
-rw-r--r--src/sha1.h21
-rw-r--r--src/xendian.h21
11 files changed, 667 insertions, 0 deletions
diff --git a/src/base32.c b/src/base32.c
new file mode 100644
index 0000000..82c5b48
--- /dev/null
+++ b/src/base32.c
@@ -0,0 +1,93 @@
+#include <assert.h>
+
+#include "base32.h"
+#include "common.h"
+
+/* Pack eight 5-bit values into five bytes; defined at the bottom of this
+   file. */
+static inline bool b32blktoa(uint8_t *restrict, const uint8_t *restrict)
+ __attribute__((always_inline));
+
+/* Maps every possible input byte to its 5-bit Base32 value: ‘A’–‘Z’ give
+   0–25 and ‘2’–‘7’ give 26–31.  The padding character ‘=’ (0x3D) maps to
+   0.  Every other byte, lowercase letters included, maps to the sentinel
+   0xFF. */
+static const uint8_t lookup[] = {
+ /* [00…07] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [08…0F] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [10…17] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [18…1F] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [20…27] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [28…2F] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [30…37] = */ 0xFF, 0xFF, 26, 27, 28, 29, 30, 31,
+ /* [38…3F] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0xFF, 0xFF,
+ /* [40…47] = */ 0xFF, 0, 1, 2, 3, 4, 5, 6,
+ /* [48…4F] = */ 7, 8, 9, 10, 11, 12, 13, 14,
+ /* [50…57] = */ 15, 16, 17, 18, 19, 20, 21, 22,
+ /* [58…5F] = */ 23, 24, 25, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [60…67] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [68…6F] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [70…77] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [78…7F] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [80…87] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [88…8F] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [90…97] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [98…9F] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [A0…A7] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [A8…AF] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [B0…B7] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [B8…BF] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [C0…C7] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [C8…CF] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [D0…D7] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [D8…DF] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [E0…E7] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [E8…EF] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [F0…F7] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ /* [F8…FF] = */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+};
+
+/* Decode the LEN Base32 characters at SRC into DST.
+
+   Exactly LEN * 5 / 8 bytes are written to DST: five for every complete
+   group of eight characters, plus only the whole bytes that a trailing
+   partial group encodes.  The trailing group is decoded into a scratch
+   buffer first so that a destination sized LEN * 5 / 8 is never
+   overrun.
+
+   LEN must be non-zero.  Returns false as soon as a group contains a
+   byte that the lookup table marks as invalid; DST may then hold
+   partial output. */
+bool
+b32toa(uint8_t *restrict dst, const char *restrict src, size_t len)
+{
+	assert(len != 0);
+
+	/* Complete groups: eight characters in, five bytes out */
+	size_t i, j;
+	for (i = j = 0; len - i >= 8; i += 8, j += 5) {
+		uint8_t bits[8];
+		for (size_t k = 0; k < 8; k++)
+			bits[k] = lookup[(uint8_t)src[i + k]];
+		if (!b32blktoa(dst + j, bits))
+			return false;
+	}
+
+	size_t rem = len - i;
+	if (rem == 0)
+		return true;
+
+	/* Trailing partial group: the missing symbols count as zero bits */
+	uint8_t bits[8] = {0};
+	for (size_t k = 0; k < rem; k++)
+		bits[k] = lookup[(uint8_t)src[i + k]];
+
+	uint8_t tail[5];
+	if (!b32blktoa(tail, bits))
+		return false;
+
+	/* Copy out only the bytes that REM characters fully determine */
+	size_t ntail = rem * 5 / 8;
+	for (size_t k = 0; k < ntail; k++)
+		dst[j + k] = tail[k];
+	return true;
+}
+
+/* Pack the eight 5-bit values at SRC into the five bytes at DST.
+
+   Returns false, leaving DST untouched, when the bitwise OR of all eight
+   inputs is 0xFF, which is how the invalid-byte sentinel shows up. */
+bool
+b32blktoa(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	uint8_t seen = 0;
+	for (int k = 0; k < 8; k++)
+		seen |= src[k];
+	if (seen == 0xFF)
+		return false;
+
+	const uint8_t a = src[0], b = src[1], c = src[2], d = src[3];
+	const uint8_t e = src[4], f = src[5], g = src[6], h = src[7];
+
+	/* Each output byte keeps only the low 8 bits of the combined value */
+	dst[0] = (uint8_t)(a << 3 | b >> 2);
+	dst[1] = (uint8_t)(b << 6 | c << 1 | d >> 4);
+	dst[2] = (uint8_t)(d << 4 | e >> 1);
+	dst[3] = (uint8_t)(e << 7 | f << 2 | g >> 3);
+	dst[4] = (uint8_t)(g << 5 | h);
+	return true;
+}
diff --git a/src/base32.h b/src/base32.h
new file mode 100644
index 0000000..2581878
--- /dev/null
+++ b/src/base32.h
@@ -0,0 +1,10 @@
+#ifndef TOTP_BASE32_H
+#define TOTP_BASE32_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Decode Base32 text into raw bytes.
+
+   Arguments are, in order: the destination buffer, the Base32 characters
+   to decode, and the number of characters, which must be non-zero.  The
+   destination should have room for five bytes per started group of
+   eight input characters.  Returns false if the input contains a byte
+   outside the accepted alphabet. */
+bool b32toa(uint8_t *restrict, const char *restrict, size_t);
+
+#endif /* !TOTP_BASE32_H */
diff --git a/src/common.h b/src/common.h
new file mode 100644
index 0000000..c1d21c8
--- /dev/null
+++ b/src/common.h
@@ -0,0 +1,27 @@
+#ifndef TOTP_COMMON_H
+#define TOTP_COMMON_H
+
+/* Compilers that do not define __GNUC__ get __attribute__() expanded to
+   nothing, so attribute annotations in the sources are dropped there. */
+#if !__GNUC__
+# define __attribute__(x)
+#endif
+
+/* TODO: Is this endian stuff potentially useful? */
+
+/* If C23 or newer include this to get byte-order macros */
+#if __STDC_VERSION__ >= 202311L
+# include <stdbit.h>
+#endif
+
+/* Define exactly one of ENDIAN_BIG, ENDIAN_LITTLE or ENDIAN_UNKNOWN to 1,
+   based on the compiler’s __BYTE_ORDER__ or on the C23
+   __STDC_ENDIAN_NATIVE__ macro, whichever is available. */
+#if (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) \
+ || (defined(__STDC_ENDIAN_NATIVE__) \
+ && __STDC_ENDIAN_NATIVE__ == __STDC_ENDIAN_BIG__)
+# define ENDIAN_BIG 1
+#elif (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \
+ || (defined(__STDC_ENDIAN_NATIVE__) \
+ && __STDC_ENDIAN_NATIVE__ == __STDC_ENDIAN_LITTLE__)
+# define ENDIAN_LITTLE 1
+#else
+# define ENDIAN_UNKNOWN 1
+#endif
+
+#endif /* !TOTP_COMMON_H */
diff --git a/src/hmac.c b/src/hmac.c
new file mode 100644
index 0000000..5175ee2
--- /dev/null
+++ b/src/hmac.c
@@ -0,0 +1,41 @@
+#include <string.h>
+
+#include "sha1.h"
+
+#define IPAD (0x36)
+#define OPAD (0x5C)
+
+/* Compute the HMAC-SHA1 of the MSGSZ bytes at MSG under the KEYSZ-byte
+   KEY and store the SHA1DGSTSZ-byte result in OUT. */
+void
+hmac_sha1(uint8_t *restrict out,
+          const uint8_t *restrict key, size_t keysz,
+          const uint8_t *restrict msg, size_t msgsz)
+{
+	/* Block-sized key: short keys are zero-padded, keys longer than one
+	   block are replaced by their SHA-1 digest (also zero-padded). */
+	uint8_t blockkey[SHA1BLKSZ] = {0};
+	if (keysz <= SHA1BLKSZ) {
+		memcpy(blockkey, key, keysz);
+	} else {
+		sha1_t keyhash;
+		sha1init(&keyhash);
+		sha1hash(&keyhash, key, keysz);
+		sha1end(&keyhash, blockkey);
+	}
+
+	uint8_t inner[SHA1BLKSZ];
+	uint8_t outer[SHA1BLKSZ];
+	for (size_t i = 0; i < SHA1BLKSZ; i++) {
+		inner[i] = blockkey[i] ^ IPAD;
+		outer[i] = blockkey[i] ^ OPAD;
+	}
+
+	sha1_t ctx;
+	uint8_t innerdgst[SHA1DGSTSZ];
+
+	/* Inner hash: H((key ^ ipad) || msg) */
+	sha1init(&ctx);
+	sha1hash(&ctx, inner, sizeof(inner));
+	sha1hash(&ctx, msg, msgsz);
+	sha1end(&ctx, innerdgst);
+
+	/* Outer hash: H((key ^ opad) || inner digest) */
+	sha1init(&ctx);
+	sha1hash(&ctx, outer, sizeof(outer));
+	sha1hash(&ctx, innerdgst, sizeof(innerdgst));
+	sha1end(&ctx, out);
+}
diff --git a/src/hmac.h b/src/hmac.h
new file mode 100644
index 0000000..3c3e8e7
--- /dev/null
+++ b/src/hmac.h
@@ -0,0 +1,11 @@
+#ifndef TOTP_HMAC_H
+#define TOTP_HMAC_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* Compute an HMAC-SHA1.  Arguments are, in order: the output buffer for
+   the 20-byte digest, the key and its size in bytes, and the message and
+   its size in bytes. */
+void hmac_sha1(uint8_t *restrict,
+ const uint8_t *restrict, size_t,
+ const uint8_t *restrict, size_t);
+
+#endif /* !TOTP_HMAC_H */
diff --git a/src/main.c b/src/main.c
new file mode 100644
index 0000000..40318cf
--- /dev/null
+++ b/src/main.c
@@ -0,0 +1,191 @@
+#include <err.h>
+#include <errno.h>
+#include <getopt.h>
+#include <inttypes.h>
+#include <libgen.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdnoreturn.h>
+#include <string.h>
+#include <time.h>
+
+#include "base32.h"
+#include "common.h"
+#include "hmac.h"
+#include "sha1.h"
+#include "xendian.h"
+
+/* Decode one Base32 secret and print a code for the current time */
+static void process(const char *, size_t);
+/* Call process() on every line of the standard input */
+static void process_stdin(void);
+/* Integer exponentiation on 32-bit unsigned values */
+static inline uint32_t pow32(uint32_t, uint32_t)
+ __attribute__((always_inline, const));
+/* Locale-independent test for the ASCII digits ‘0’–‘9’ */
+static inline bool xisdigit(char)
+ __attribute__((always_inline, const));
+/* Runtime byte-order probe.  NOTE(review): no caller is visible in this
+   file. */
+static inline bool bigendian(void)
+ __attribute__((always_inline, const));
+
+/* Number of digits printed per code and the divisor applied to the
+   current time(2) value; set by -d and -p respectively. */
+static int digits = 6, period = 30;
+
+static noreturn void
+usage(const char *argv0)
+{
+ fprintf(stderr,
+ "Usage: %s [-d digits] [-p period] [secret]\n"
+ " %s -h\n",
+ argv0, argv0);
+ exit(EXIT_FAILURE);
+}
+
+/* Parse the command line, then print a code for the secret given as the
+   sole operand or, when there is no operand, for every line of the
+   standard input.  More than one operand is a usage error. */
+int
+main(int argc, char **argv)
+{
+	int opt;
+	static const struct option longopts[] = {
+		{"digits", required_argument, 0, 'd'},
+		{"period", required_argument, 0, 'p'},
+		{0},
+	};
+
+	argv[0] = basename(argv[0]);
+	while ((opt = getopt_long(argc, argv, "d:hp:", longopts, NULL)) != -1) {
+		switch (opt) {
+		case 'd':
+		case 'p': {
+			/* strtol() allows for numbers with leading spaces and a
+			   ‘+’/‘-’.  We don’t want that, so assert that the input
+			   begins with a number. */
+			if (!xisdigit(optarg[0]))
+				errx(1, "%s: Invalid integer", optarg);
+
+			errno = 0;
+			char *endptr;
+			long n = strtol(optarg, &endptr, 10);
+
+			/* There are trailing invalid digits */
+			if (*endptr != 0)
+				errx(1, "%s: Invalid integer", optarg);
+
+			/* The number was too large.  We asserted that the input
+			   didn’t start with ‘-’ so we can ignore checking for
+			   LONG_MIN. */
+			if (n > INT_MAX)
+				errno = ERANGE;
+			if (errno == ERANGE)
+				err(1, "%s", optarg);
+
+			if (n == 0)
+				errx(1, "%s: Integer must be non-zero", optarg);
+			if (opt == 'd')
+				digits = (int)n;
+			else
+				period = (int)n;
+			break;
+		}
+		/* -h is advertised by usage(); it is listed in the option
+		   string so that getopt does not first report it as an
+		   invalid option. */
+		case 'h':
+		default:
+			usage(argv[0]);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	switch (argc) {
+	case 0:
+		process_stdin();
+		break;
+	case 1:
+		process(argv[0], strlen(argv[0]));
+		break;
+	default:
+		/* argv was advanced by optind, so this is the program name */
+		usage(argv[-optind]);
+	}
+
+	return EXIT_SUCCESS;
+}
+
+/* Read secrets from the standard input, one per line, and print a code
+   for each.  A trailing newline is stripped before the line is handed to
+   process().  Exits with a diagnostic if reading fails. */
+void
+process_stdin(void)
+{
+	char *line = NULL;
+	size_t cap = 0;
+	ssize_t nr;
+
+	while ((nr = getline(&line, &cap, stdin)) != -1) {
+		if (line[nr - 1] == '\n')
+			line[--nr] = 0;
+		process(line, (size_t)nr);
+	}
+
+	/* getline() returns -1 both at end-of-file and on error.  Ask the
+	   stream which one it was; errno may hold a stale value left behind
+	   by an earlier, unrelated call. */
+	if (ferror(stdin))
+		err(1, "getline");
+	free(line);
+}
+
+/* Decode the N-character Base32 secret at S and print the code for the
+   current time step, zero-padded to ‘digits’ digits.  Trailing ‘=’
+   padding is ignored.  Exits with a diagnostic if the secret is empty or
+   is not valid Base32. */
+void
+process(const char *s, size_t n)
+{
+	/* Remove padding bytes */
+	while (n > 0 && s[n - 1] == '=')
+		n--;
+	if (n == 0)
+		errx(1, "Empty Base32 input");
+
+	static uint8_t keybuf[256];
+	uint8_t *key = keybuf;
+
+	/* The key is n*5/8 bytes long, but the decoder may emit a whole
+	   5-byte group for every started group of 8 characters, so the
+	   buffer is sized for that. */
+	size_t keysz = n * 5 / 8;
+	size_t bufsz = (n + 7) / 8 * 5;
+	if (bufsz > sizeof(keybuf)) {
+		if ((key = malloc(bufsz)) == NULL)
+			err(1, "malloc");
+	}
+
+	if (!b32toa(key, s, n))
+		errx(1, "%s: Invalid Base32 input", s);
+
+	/* time(2) claims that this call will never fail if passed a NULL
+	   argument.  We cast the time_t to uint64_t which will always be
+	   safe to do.  The counter is hashed in big-endian byte order. */
+	uint64_t epoch = htobe64((uint64_t)time(NULL) / (uint64_t)period);
+	uint8_t dgst[SHA1DGSTSZ];
+	hmac_sha1(dgst, key, keysz, (uint8_t *)&epoch, sizeof(epoch));
+
+	/* Dynamic truncation: the low nibble of the last digest byte selects
+	   a 4-byte window, whose top bit is then cleared. */
+	int off = dgst[19] & 0x0F;
+	uint32_t binc = (uint32_t)(dgst[off + 0] & 0x7F) << 24
+	              | (uint32_t)(dgst[off + 1] & 0xFF) << 16
+	              | (uint32_t)(dgst[off + 2] & 0xFF) << 8
+	              | (uint32_t)(dgst[off + 3] & 0xFF) << 0;
+
+	/* binc is below 2^31 and therefore below 10^10, so with ten or more
+	   digits the reduction is a no-op.  Skipping it also keeps
+	   10^digits from overflowing 32 bits. */
+	uint32_t code = binc;
+	if (digits < 10)
+		code %= pow32(10, (uint32_t)digits);
+	printf("%0*" PRIu32 "\n", digits, code);
+
+	if (key != keybuf)
+		free(key);
+}
+
+/* Return X raised to the power Y.  Any value to the power 0 is 1.  If the
+   result does not fit in 32 bits, UINT32_MAX is returned instead of a
+   wrapped value, so the result is never a spurious 0 for a non-zero
+   base. */
+uint32_t
+pow32(uint32_t x, uint32_t y)
+{
+	if (y == 0)
+		return 1;
+	/* 0 and 1 are fixed points; this also bounds the loop below to at
+	   most 32 iterations. */
+	if (x <= 1)
+		return x;
+
+	uint32_t result = 1;
+	while (y-- != 0) {
+		if (result > UINT32_MAX / x)
+			return UINT32_MAX;
+		result *= x;
+	}
+	return result;
+}
+
+/* Return true if CH is one of the ASCII digits ‘0’ through ‘9’ */
+bool
+xisdigit(char ch)
+{
+	/* Characters below ‘0’ wrap around to a large unsigned value */
+	return (unsigned)(ch - '0') < 10u;
+}
+
+/* Return true if the first byte in memory of a 16-bit integer is its
+   most significant one, i.e. if the machine is big-endian. */
+bool
+bigendian(void)
+{
+	const uint16_t probe = 0x0102;
+	const uint8_t *first = (const uint8_t *)&probe;
+	return *first == 1;
+}
diff --git a/src/sha1-generic.c b/src/sha1-generic.c
new file mode 100644
index 0000000..d897a8f
--- /dev/null
+++ b/src/sha1-generic.c
@@ -0,0 +1,78 @@
+#include "common.h"
+#include "sha1.h"
+#include "xendian.h"
+
+/* Rotate a 32-bit value left by the given number of bits */
+static inline uint32_t rotl32(uint32_t x, uint8_t bits)
+ __attribute__((always_inline, const));
+
+/* Additive round constants; sha1hashblk() uses K[0] for rounds 0–19,
+   K[1] for 20–39, K[2] for 40–59 and K[3] for 60–79. */
+static const uint32_t K[] = {
+ 0x5A827999,
+ 0x6ED9EBA1,
+ 0x8F1BBCDC,
+ 0xCA62C1D6,
+};
+
+/* Run the SHA-1 compression function over the 64-byte block at BLK and
+   fold the result into the running digest in S.  BLK may have any
+   alignment. */
+void
+sha1hashblk(sha1_t *s, const uint8_t *blk)
+{
+	uint32_t w[80];
+	uint32_t a, b, c, d, e, tmp;
+
+	/* Load the block as sixteen big-endian words.  Assembling each word
+	   from bytes avoids reading a byte buffer through a uint32_t
+	   pointer and works on any host byte order. */
+	for (int i = 0; i < 16; i++) {
+		const uint8_t *p = blk + 4*i;
+		w[i] = (uint32_t)p[0] << 24
+		     | (uint32_t)p[1] << 16
+		     | (uint32_t)p[2] << 8
+		     | (uint32_t)p[3] << 0;
+	}
+
+	/* Message schedule.  From word 32 onwards the recurrence is the
+	   rotate-by-2 form, which reaches back in multiples of two words. */
+	for (int i = 16; i < 32; i++)
+		w[i] = rotl32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
+	for (int i = 32; i < 80; i++)
+		w[i] = rotl32(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], 2);
+
+	a = s->dgst[0];
+	b = s->dgst[1];
+	c = s->dgst[2];
+	d = s->dgst[3];
+	e = s->dgst[4];
+
+	for (int i = 0; i < 80; i++) {
+		uint32_t f, k;
+
+		/* The round function and constant change every 20 rounds */
+		if (i < 20) {
+			f = (b & c) | (~b & d);
+			k = K[0];
+		} else if (i < 40) {
+			f = b ^ c ^ d;
+			k = K[1];
+		} else if (i < 60) {
+			f = (b & c) | (b & d) | (c & d);
+			k = K[2];
+		} else {
+			f = b ^ c ^ d;
+			k = K[3];
+		}
+
+		tmp = rotl32(a, 5) + f + e + w[i] + k;
+		e = d;
+		d = c;
+		c = rotl32(b, 30);
+		b = a;
+		a = tmp;
+	}
+
+	s->dgst[0] += a;
+	s->dgst[1] += b;
+	s->dgst[2] += c;
+	s->dgst[3] += d;
+	s->dgst[4] += e;
+}
+
+/* Rotate X left by BITS bit positions.  Only the low five bits of BITS
+   are used, so a count of 0 or 32 returns X unchanged.
+
+   This is plain C rather than inline assembly: both shift counts are
+   masked to 0–31, so no shift is ever by the full width of the type,
+   and the form is one compilers commonly turn into a single rotate
+   instruction. */
+uint32_t
+rotl32(uint32_t x, uint8_t bits)
+{
+	unsigned n = bits & 31u;
+	return (x << n) | (x >> ((32u - n) & 31u));
+}
diff --git a/src/sha1-x64.c b/src/sha1-x64.c
new file mode 100644
index 0000000..be19ab7
--- /dev/null
+++ b/src/sha1-x64.c
@@ -0,0 +1,98 @@
+#include <immintrin.h>
+
+#include "sha1.h"
+
+/* One group of four SHA-1 rounds, including the message-schedule updates
+   needed by later groups.  MI is the message vector consumed by this
+   group; MJ, MK and ML are the other three message vectors, which are
+   updated in place.  EI is the ‘e’ input for this group and EJ receives
+   the value of ‘abcd’ from before the rounds.  F is the immediate that
+   selects the round function for _mm_sha1rnds4_epu32().  The macro
+   reads and writes the caller’s ‘abcd’ variable. */
+#define R(mi, mj, mk, ml, ei, ej, f) \
+ do { \
+ ei = _mm_sha1nexte_epu32(ei, mi); \
+ ej = abcd; \
+ mj = _mm_sha1msg2_epu32(mj, mi); \
+ abcd = _mm_sha1rnds4_epu32(abcd, ei, f); \
+ ml = _mm_sha1msg1_epu32(ml, mi); \
+ mk = _mm_xor_si128(mk, mi); \
+ } while (0)
+
+/* Run the SHA-1 compression function over the 64-byte block at BLK and
+   fold the result into the running digest in S, using the x86 SHA
+   extension intrinsics.  Both the block and the digest are accessed with
+   unaligned loads and stores. */
+void
+sha1hashblk(sha1_t *s, const uint8_t *blk)
+{
+ __m128i abcd, e0, e1;
+ __m128i abcd_save, e_save;
+ __m128i msg0, msg1, msg2, msg3;
+
+ /* Masks for swapping endianness. We make BSWAPDMSK a macro to
+ please the compiler (it wants immediate values). */
+ /* bswapdmsk reverses the order of the four 32-bit lanes; bswapbmsk
+ reverses the order of all sixteen bytes. */
+#define bswapdmsk 0x1B /* 0b00'01'10'11 */
+ const __m128i bswapbmsk = _mm_set_epi64x(
+ 0x0001020304050607ULL,
+ 0x08090a0b0c0d0e0fULL
+ );
+
+ const __m128i *blkx = (const __m128i *)blk;
+
+ /* Load dgst[0..3] with the lane order reversed and place dgst[4] in
+ the top lane of e0 */
+ abcd = _mm_shuffle_epi32(_mm_loadu_si128((__m128i *)s->dgst), bswapdmsk);
+ e0 = _mm_set_epi32(s->dgst[4], 0, 0, 0);
+
+ /* Remember the incoming state; it is added back in after round 79 */
+ abcd_save = abcd;
+ e_save = e0;
+
+ /* Rounds 0–3 */
+ msg0 = _mm_shuffle_epi8(_mm_loadu_si128(blkx + 0), bswapbmsk);
+ e0 = _mm_add_epi32(e0, msg0);
+ e1 = abcd;
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
+
+ /* Rounds 4–7 */
+ msg1 = _mm_shuffle_epi8(_mm_loadu_si128(blkx + 1), bswapbmsk);
+ e1 = _mm_sha1nexte_epu32(e1, msg1);
+ e0 = abcd;
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
+ msg0 = _mm_sha1msg1_epu32(msg0, msg1);
+
+ /* Rounds 8–11 */
+ msg2 = _mm_shuffle_epi8(_mm_loadu_si128(blkx + 2), bswapbmsk);
+ e0 = _mm_sha1nexte_epu32(e0, msg2);
+ e1 = abcd;
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
+ msg1 = _mm_sha1msg1_epu32(msg1, msg2);
+ msg0 = _mm_xor_si128(msg0, msg2);
+
+ /* From here on every group of four rounds has the same shape, with
+ the message vectors and the two ‘e’ registers rotating roles */
+ msg3 = _mm_shuffle_epi8(_mm_loadu_si128(blkx + 3), bswapbmsk);
+ R(msg3, msg0, msg1, msg2, e1, e0, 0); /* Rounds 12–15 */
+ R(msg0, msg1, msg2, msg3, e0, e1, 0); /* Rounds 16–19 */
+ R(msg1, msg2, msg3, msg0, e1, e0, 1); /* Rounds 20–23 */
+ R(msg2, msg3, msg0, msg1, e0, e1, 1); /* Rounds 24–27 */
+ R(msg3, msg0, msg1, msg2, e1, e0, 1); /* Rounds 28–31 */
+ R(msg0, msg1, msg2, msg3, e0, e1, 1); /* Rounds 32–35 */
+ R(msg1, msg2, msg3, msg0, e1, e0, 1); /* Rounds 36–39 */
+ R(msg2, msg3, msg0, msg1, e0, e1, 2); /* Rounds 40–43 */
+ R(msg3, msg0, msg1, msg2, e1, e0, 2); /* Rounds 44–47 */
+ R(msg0, msg1, msg2, msg3, e0, e1, 2); /* Rounds 48–51 */
+ R(msg1, msg2, msg3, msg0, e1, e0, 2); /* Rounds 52–55 */
+ R(msg2, msg3, msg0, msg1, e0, e1, 2); /* Rounds 56–59 */
+ R(msg3, msg0, msg1, msg2, e1, e0, 3); /* Rounds 60–63 */
+ R(msg0, msg1, msg2, msg3, e0, e1, 3); /* Rounds 64–67 */
+
+ /* The last three groups are written out by hand and perform fewer
+ message-schedule updates than R() does */
+
+ /* Rounds 68–71 */
+ e1 = _mm_sha1nexte_epu32(e1, msg1);
+ e0 = abcd;
+ msg2 = _mm_sha1msg2_epu32(msg2, msg1);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
+ msg3 = _mm_xor_si128(msg3, msg1);
+
+ /* Rounds 72–75 */
+ e0 = _mm_sha1nexte_epu32(e0, msg2);
+ e1 = abcd;
+ msg3 = _mm_sha1msg2_epu32(msg3, msg2);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 3);
+
+ /* Rounds 76–79 */
+ e1 = _mm_sha1nexte_epu32(e1, msg3);
+ e0 = abcd;
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
+
+ /* Add the saved input state to the result */
+ e0 = _mm_sha1nexte_epu32(e0, e_save);
+ abcd = _mm_add_epi32(abcd, abcd_save);
+
+ /* Store back, undoing the lane reversal applied on load */
+ _mm_storeu_si128((__m128i *)s->dgst, _mm_shuffle_epi32(abcd, bswapdmsk));
+ s->dgst[4] = _mm_extract_epi32(e0, 3);
+}
diff --git a/src/sha1.c b/src/sha1.c
new file mode 100644
index 0000000..5759991
--- /dev/null
+++ b/src/sha1.c
@@ -0,0 +1,76 @@
+#include <err.h>
+#include <errno.h>
+#include <string.h>
+
+#include "sha1.h"
+#include "xendian.h"
+
+/* Number of elements in the array X; X must be an array, not a pointer */
+#define lengthof(x) (sizeof(x) / sizeof(*(x)))
+/* Smaller of X and Y; each argument may be evaluated twice */
+#define MIN(x, y) ((x) < (y) ? (x) : (y))
+
+/* Block compression function, provided by a separate source file */
+void sha1hashblk(sha1_t *, const uint8_t *);
+
+/* Reset S to the SHA-1 initial state: the five initial hash words, a
+   message length of zero and an empty block buffer. */
+void
+sha1init(sha1_t *s)
+{
+	s->dgst[0] = 0x67452301;
+	s->dgst[1] = 0xEFCDAB89;
+	s->dgst[2] = 0x98BADCFE;
+	s->dgst[3] = 0x10325476;
+	s->dgst[4] = 0xC3D2E1F0;
+
+	s->bufsz = 0;
+	s->msgsz = 0;
+}
+
+/* Feed the MSGSZ bytes at MSG into the hash state S.  May be called any
+   number of times between sha1init() and sha1end().
+
+   The total message length is tracked in bits in a 64-bit counter; if
+   adding MSGSZ bytes would overflow it, the program exits with
+   EOVERFLOW. */
+void
+sha1hash(sha1_t *s, const uint8_t *msg, size_t msgsz)
+{
+	/* Compare in bytes against the remaining headroom so that the check
+	   itself cannot overflow, whatever the width of size_t. */
+	if (msgsz > (UINT64_MAX - s->msgsz) / 8) {
+		errno = EOVERFLOW;
+		err(1, "sha1");
+	}
+
+	s->msgsz += (uint64_t)msgsz * 8;
+
+	while (msgsz != 0) {
+		/* Top up the block buffer with as much input as fits */
+		size_t free_space = SHA1BLKSZ - s->bufsz;
+		size_t ncpy = MIN(msgsz, free_space);
+		memcpy(s->buf + s->bufsz, msg, ncpy);
+		s->bufsz += ncpy;
+		msg += ncpy;
+		msgsz -= ncpy;
+
+		/* Compress the buffer as soon as it holds a full block */
+		if (s->bufsz == SHA1BLKSZ) {
+			sha1hashblk(s, s->buf);
+			s->bufsz = 0;
+		}
+	}
+}
+
+/* Finish the hash: append the SHA-1 padding and the 64-bit big-endian
+   message length in bits, compress the final block(s) and write the
+   20-byte digest to DGST in big-endian word order. */
+void
+sha1end(sha1_t *s, uint8_t *dgst)
+{
+	/* Offset of the 8-byte length field at the end of a block */
+	const size_t lenoff = SHA1BLKSZ - sizeof(uint64_t);
+
+	s->buf[s->bufsz++] = 0x80;
+
+	/* No room left for the length field: zero-fill and flush this block,
+	   then continue in a fresh one. */
+	if (s->bufsz > lenoff) {
+		memset(s->buf + s->bufsz, 0, SHA1BLKSZ - s->bufsz);
+		sha1hashblk(s, s->buf);
+		s->bufsz = 0;
+	}
+
+	memset(s->buf + s->bufsz, 0, lenoff - s->bufsz);
+	s->bufsz = lenoff;
+
+	uint64_t bitlen = htobe64(s->msgsz);
+	memcpy(s->buf + lenoff, &bitlen, sizeof(bitlen));
+
+	sha1hashblk(s, s->buf);
+
+	/* Emit each digest word most significant byte first */
+	for (size_t i = 0; i < lengthof(s->dgst); i++) {
+		uint32_t word = s->dgst[i];
+		uint8_t *out = dgst + i*sizeof(uint32_t);
+		out[0] = (uint8_t)(word >> 24);
+		out[1] = (uint8_t)(word >> 16);
+		out[2] = (uint8_t)(word >> 8);
+		out[3] = (uint8_t)(word >> 0);
+	}
+}
diff --git a/src/sha1.h b/src/sha1.h
new file mode 100644
index 0000000..ea08d37
--- /dev/null
+++ b/src/sha1.h
@@ -0,0 +1,21 @@
+#ifndef TOTP_SHA1_H
+#define TOTP_SHA1_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* Size in bytes of a SHA-1 digest and of a SHA-1 input block */
+#define SHA1DGSTSZ (20)
+#define SHA1BLKSZ (64)
+
+/* State of an in-progress SHA-1 computation */
+typedef struct {
+ /* Running hash value as five 32-bit words */
+ uint32_t dgst[SHA1DGSTSZ / sizeof(uint32_t)];
+ /* Total length of the message so far, in bits */
+ uint64_t msgsz;
+ /* Input bytes not yet compressed, and how many of them there are */
+ uint8_t buf[SHA1BLKSZ];
+ size_t bufsz;
+} sha1_t;
+
+/* Usage: call sha1init() once, sha1hash() for each piece of input, then
+   sha1end() to write the SHA1DGSTSZ-byte digest to the given buffer. */
+void sha1init(sha1_t *);
+void sha1hash(sha1_t *, const uint8_t *, size_t);
+void sha1end(sha1_t *, uint8_t *);
+
+#endif /* !TOTP_SHA1_H */
diff --git a/src/xendian.h b/src/xendian.h
new file mode 100644
index 0000000..b43661f
--- /dev/null
+++ b/src/xendian.h
@@ -0,0 +1,21 @@
+#ifndef TOTP_XENDIAN_H
+#define TOTP_XENDIAN_H
+
+/* This header grabs the htobe64() and co. functions in a more
+   cross-platform manner. In general you will find these functions in
+   <sys/endian.h>, however Linux and OpenBSD include them in <endian.h>.
+   Neither header exists on macOS, so there we define wrapper macros
+   around the htonXX() functions from <arpa/inet.h> instead. */
+
+#if defined(__OpenBSD__) || defined(__linux__)
+# include <endian.h>
+#elif defined(__APPLE__)
+/* Only the two conversions used by this project are wrapped */
+# include <arpa/inet.h>
+# define htobe32(x) htonl(x)
+# define htobe64(x) htonll(x)
+#else
+# include <sys/endian.h>
+#endif
+
+#endif /* !TOTP_XENDIAN_H */