From 3b52f1d3860b87eacd2b79e2d7f9109999e3f9a9 Mon Sep 17 00:00:00 2001
From: PoroCYon <3253268+PoroCYon@users.noreply.github.com>
Date: Mon, 10 Dec 2018 12:47:43 +0000
Subject: [PATCH] Optimize e.asm

---
 e.asm | 63 +++++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 51 insertions(+), 12 deletions(-)

diff --git a/e.asm b/e.asm
index 0dc586d..552fc09 100644
--- a/e.asm
+++ b/e.asm
@@ -1,17 +1,56 @@
+; compile with:
+; $ [ny]asm -felf(32|64) -oe.o e.asm
+; $ (gcc|clang) -m(32|64) -oe e.o -nostdlib -nostartfiles
+
 section     .text
-global      _start                              
+global      _start
 
-_start:                                         
+%if __BITS__ != 32 ; r-prefixed registers, `syscall` instruction
+%define r(n) r%+n
+%define SYS_write 1
+%define rarg0 rdi
+%define rarg1 rsi
+%define rarg2 rdx
+default rel
+%else ; __BITS__ == 32: e-prefixed registers, `int 0x80` gate
+%define r(n) e%+n
+%define SYS_write 4
+%define rarg0 ebx
+%define rarg1 ecx
+%define rarg2 edx
+%define syscall int 0x80
+%endif ; r(n), SYS_write, rarg0-2 and `syscall` are now usable in either mode
 
-	mov rax, 1
-	mov rdi, 1
-	mov rsi, msg
-	mov rdx, len
-	loop:
-	syscall
-	jmp loop                             
+; default capacity of a Linux pipe (64 KiB); also used as the size of our buffer
+%define PIPE_SIZE 0x10000
+%define STDOUT_FILENO 1
 
-section     .data
+; Instead of simply storing a char in .rodata and write(2)-ing it
+; over and over again, we first fill a buffer full of e's, and *then*
+; write the entire buffer repeatedly. This is much faster than the first
+; option, because we only need one syscall per 65536 bytes instead of one
+; per byte. (Every syscall costs a round trip into the kernel and back.)
 
-msg: db "e"                 
-len: equ $ - msg                             
+_start:
+        ; carve the buffer out of the stack; r(cx) doubles as the rep stosb count
+        mov r(cx), PIPE_SIZE
+        mov rarg2, r(cx) ; write(2) length; not r(bx), which is rarg0 (the fd) when __BITS__ == 32
+        sub r(sp), r(cx)
+
+        ; quick memset(3): fill the whole buffer with 'e' (leaves r(cx) == 0)
+        mov al, 'e'
+        mov r(di), r(sp)
+        rep stosb
+
+        ; push+pop is a smaller encoding than mov for ints that fit in a signed byte
+        push STDOUT_FILENO
+        pop rarg0
+        mov rarg1, r(sp)
+        ; rarg0 = fd, rarg1 = buffer, rarg2 = length; the loop never reloads them
+
+.loop:
+        ; reload every iteration: the syscall's return value overwrites r(ax)
+        push SYS_write
+        pop r(ax)
+        syscall
+        jmp short .loop