diff --git a/compile.go b/compile.go index 1f150a8..b35d88e 100644 --- a/compile.go +++ b/compile.go @@ -6,17 +6,22 @@ import ( "fmt" "io" "strconv" + "strings" ) type section struct { - name string - buff bytes.Buffer + name string + name_shstrtabOffset int + buff bytes.Buffer } type symtabEntry struct { + symtabSectionIndex int + sectionName string kind string offset int64 + length int64 global bool } @@ -24,12 +29,78 @@ type compiler struct { symtab map[string]symtabEntry sections []section currentSection *section + shstrtab *section } func NewCompiler() *compiler { - return &compiler{ - symtab: map[string]symtabEntry{}, // starts out empty + c := &compiler{ + symtab: map[string]symtabEntry{}, } + + c.sections = append(c.sections, section{ + name: `.shstrtab`, // Mandatory: the table that names sections themselves + name_shstrtabOffset: 1, + buff: bytes.Buffer{}, + }) + c.shstrtab = &c.sections[0] + // The first byte in a string table is conventionally expected be \x00, so that you can reference + // null strings with it + c.shstrtab.buff.WriteByte(0) + + c.shstrtab.buff.WriteString(c.shstrtab.name) + c.shstrtab.buff.WriteByte(0) + + return c +} + +func (c *compiler) CreateSymbol(name string, class string, offset int64, length int64, global bool) error { + + if _, ok := c.symtab[name]; ok { + return fmt.Errorf("Symbol %q already exists", name) + } + + // Find the .symtab section, or create if it does not exist + symtabSec := c.FindOrCreateSection(`.symtab`) + + // New entry index = length / len(entry) = length / 24 + nextIndex := symtabSec.buff.Len() / 24 + + // Add to our fast lookup table + c.symtab[name] = symtabEntry{ + symtabSectionIndex: nextIndex, + + sectionName: c.currentSection.name, + kind: class, + offset: offset, + global: global, + length: length, + } + + // Find the section index for the section containing this symbol + sectionIndex := -1 + for i, _ := range c.sections { + if c.sections[i].name == c.currentSection.name { + sectionIndex = i + break + } + } + if sectionIndex == -1 { + return fmt.Errorf("Current section missing index") + } + + // Add to the .symtab section + // This is required for variable references - after our single ELF .o is + // created, linking it with any other .o files will create a combined .text + // section where all the offsets have shifted + esym := Elf64_Sym{} + esym.st_name = 0 // Default: unnamed + esym.st_info = STT_SECTION | (STB_LOCAL << 4) + esym.st_other = STV_HIDDEN // For this translation unit only + esym.st_shndx = uint16(sectionIndex) + esym.st_size = uint64(length) + + err := binary.Write(&symtabSec.buff, binary.LittleEndian, &esym) + return err } func (c *compiler) Must(b []byte) { @@ -38,7 +109,7 @@ func (c *compiler) Must(b []byte) { panic(err) } if n != len(b) { - panic(io.ErrShortWrite) + panic(fmt.Errorf("Must: %w", io.ErrShortWrite)) } } @@ -48,6 +119,59 @@ func (c *compiler) MustUint64(val uint64) { c.Must(ret) } +func (c *compiler) FindOrCreateSection(sectionName string) *section { + + if len(sectionName) == 0 || sectionName[0] != '.' { + panic("section name should start with leading period") + } + + for i, sec := range c.sections { + if sec.name != sectionName { + continue + } + + // found it + return &c.sections[i] + } + + // No section with this name. Create it + c.sections = append(c.sections, section{ + name: sectionName, + name_shstrtabOffset: c.shstrtab.buff.Len(), + buff: bytes.Buffer{}, + }) + + c.shstrtab.buff.WriteString(sectionName) + c.shstrtab.buff.WriteByte(0) + + return &c.sections[len(c.sections)-1] +} + +func (c *compiler) Reloc(symbolName string, mode ElfRelocationType, addOffset int64) error { + // Find '.rela.{currentsection}', creating it if it does not exist + var relaSec *section = c.FindOrCreateSection(`.rela` + c.currentSection.name) + + // Find target symbol + syminfo, ok := c.symtab[symbolName] + if !ok { + return fmt.Errorf("Reference to unknown symbol %q", symbolName) + } + + // Add the relocation to the .rela section + rr := Elf64_Rela{} + rr.r_offset = uint64(c.currentSection.buff.Len()) + rr.r_info = uint64(syminfo.symtabSectionIndex)<<32 | uint64(mode) // high bits: Index of search symbol in the symtab. low bits: mode type + rr.r_addend = addOffset + + err := binary.Write(&relaSec.buff, binary.LittleEndian, &rr) + if err != nil { + return err + } + + // Done + return nil +} + func (c *compiler) Compile(t Token) error { if c.currentSection == nil { // The only allowable token outside of a section is to start a new section @@ -58,36 +182,15 @@ func (c *compiler) Compile(t Token) error { switch tok := t.(type) { case SectionToken: - - // Check if we are resuming an existing section - for i, sec := range c.sections { - if sec.name == tok.SectionName { - // Found it - c.currentSection = &c.sections[i] - return nil - } - } - - // It's a new section - c.sections = append(c.sections, section{ - name: tok.SectionName, - buff: bytes.Buffer{}, - }) - c.currentSection = &c.sections[len(c.sections)-1] - + c.currentSection = c.FindOrCreateSection(tok.SectionName) return nil case DataVariableInstrToken: // Stash in symbol table for future backreferences - if _, ok := c.symtab[tok.VarName]; ok { - return fmt.Errorf("variable %q was already defined", tok.VarName) - } - c.symtab[tok.VarName] = symtabEntry{ - sectionName: c.currentSection.name, - kind: ".var." + tok.Sizeclass, - offset: int64(c.currentSection.buff.Len()), - global: false, // TODO allow this? - } + // TODO allow making global symbols? + // CreateSymbol does check for duplicate names already + + position := int64(c.currentSection.buff.Len()) // Generate bytes for the symbol switch tok.Sizeclass { @@ -99,7 +202,6 @@ func (c *compiler) Compile(t Token) error { } c.Must([]byte{byte(val)}) - return nil case "u64": // 8-byte literal @@ -109,36 +211,32 @@ func (c *compiler) Compile(t Token) error { } c.MustUint64(val) - return nil case "sz": // string with null termination ret := []byte(tok.Value) ret = append(ret, 0) c.Must(ret) - return nil default: return fmt.Errorf("variable %q has unknown size class %q", tok.VarName, tok.Sizeclass) } - case LabelToken: - if _, ok := c.symtab[tok.LabelName]; ok { - return fmt.Errorf("name %q was already defined", tok.LabelName) - } - c.symtab[tok.LabelName] = symtabEntry{ - sectionName: c.currentSection.name, - kind: ".label", - offset: int64(c.currentSection.buff.Len()), - global: tok.IsGlobal, + err := c.CreateSymbol(tok.VarName, ".var."+tok.Sizeclass, int64(c.currentSection.buff.Len()), position-int64(c.currentSection.buff.Len()), false) + if err != nil { + return err } return nil + case LabelToken: + return c.CreateSymbol(tok.LabelName, ".label", int64(c.currentSection.buff.Len()), 0, tok.IsGlobal) + case MovInstrToken: // TODO encode more cases properly if literal, err := strconv.ParseInt(tok.Args[1], 10, 64); err == nil { - + // mov rxx, imm // Store immediate in register + switch tok.Args[0] { case "rax": c.Must([]byte{0x48, 0xb8}) // TODO store in eax with shorter prefix if <32 bit @@ -156,35 +254,202 @@ func (c *compiler) Compile(t Token) error { c.Must([]byte{0x48, 0xba}) // TODO store in eax with shorter prefix if <32 bit c.MustUint64(uint64(literal)) + case "rsi": + c.Must([]byte{0x48, 0xbe}) // TODO store in eax with shorter prefix if <32 bit + c.MustUint64(uint64(literal)) + + case "rdi": + c.Must([]byte{0x48, 0xbf}) // TODO store in eax with shorter prefix if <32 bit + c.MustUint64(uint64(literal)) + default: // Store immediate in variable? - panic("not implemented: store immediate in ???? thing") + panic("mov rxx,imm pattern: missing case") + } + return nil + + } else if strings.HasPrefix(tok.Args[0], `$`) { + // mov $var, rxx + // Load register's contents into variable + // x86_64 can only really do this in a single instruction with 32-bit displacement, not full 64-bit + + switch tok.Args[1] { + case "rax": + c.Must([]byte{0x48, 0x89, 0x04, 0x25}) + default: + panic("mov $var,rax pattern: missing case") } - } else if _, ok := c.symtab[tok.Args[1]]; ok { - // Store variable's contents in register + err = c.Reloc(tok.Args[0][1:], R_X86_64_32S, 0) // Declare that this is a 32-bit reloc, not a 64-bit one + if err != nil { + return fmt.Errorf("mov with relocation: %w", err) + } + c.Must([]byte{0, 0, 0, 0}) // 32-bit + return nil - } else if _, ok := c.symtab["&"+tok.Args[1]]; ok { - // With &; store address of variable in register + } else if strings.HasPrefix(tok.Args[1], `$`) { + // mov rxx, $var + // With $; load variable contents into register + + switch tok.Args[0] { + case "rax": + c.Must([]byte{0x48, 0x8b, 0x04, 0x25}) + case "rdi": + c.Must([]byte{0x48, 0x8b, 0x3c, 0x25}) + default: + panic("mov rxx,$var pattern: missing case") + } + + err = c.Reloc(tok.Args[1][1:], R_X86_64_32S, 0) // Declare that this is a 32-bit reloc, not a 64-bit one + if err != nil { + return fmt.Errorf("mov with relocation: %w", err) + } + c.Must([]byte{0, 0, 0, 0}) // 32-bit + + return nil + + } else if strings.HasPrefix(tok.Args[1], `&$`) { + // mov rxx, &$var + // With &; assign exact address of variable to register + // This creates a movabs literal & a relocation entry + // It's always 64-bit + + switch tok.Args[0] { + case "rax": + c.Must([]byte{0x48, 0xb8}) // TODO store in eax with shorter prefix if <32 bit + case "rsi": + c.Must([]byte{0x48, 0xbe}) // TODO store in eax with shorter prefix if <32 bit + case "rdi": + c.Must([]byte{0x48, 0xbf}) // TODO store in eax with shorter prefix if <32 bit + + default: + panic("mov $var,rxx pattern: missing case") + } + + err = c.Reloc(tok.Args[1][2:], R_X86_64_64, 0) + if err != nil { + return fmt.Errorf("mov with relocation: %w", err) + } + + c.MustUint64(0) + return nil + + } else if strings.HasPrefix(tok.Args[1], `strlen($`) && strings.HasSuffix(tok.Args[1], `)`) { + // mov rxx, strlen($var) + // With strlen; if this is an sz symbol, supply its length + symname := tok.Args[1][8 : len(tok.Args[1])-1] + sym, ok := c.symtab[symname] + if !ok { + return fmt.Errorf("Can't strlen on unknown variable %q", symname) + } + + if sym.kind != ".var.sz" { + return fmt.Errorf("Can't take the strlen of variable %q with type %q (expected sz)", symname, sym.kind) + } + + effective := sym.length + return c.Compile(MovInstrToken{Args: []string{tok.Args[0], strconv.Itoa(int(effective))}}) + + } else { + panic("unknown mov type, sorry") } - panic("unknown mov type, sorry") + case SyscallInstrToken: + c.Must([]byte{0x0f, 0x05}) // syscall + return nil + + case RetInstrToken: + c.Must([]byte{0xc3}) // ret + return nil default: return fmt.Errorf("can't compile token of type %#t", t) } } +// Finalize exports the compiled sections into an ELF artefact. +// The resulting ELF is not executable directly, but it can be once fully +// linked (adding a program header and page alignment) func (c *compiler) Finalize(dest io.Writer) error { - const alignment = 4096 - // Write ELF header - // Write section headers - // Write binary content - // Pad out section to page alignment - // Done + ehdr := Elf64_Ehdr{} + ehdr.e_ident[0] = 0x7f + ehdr.e_ident[1] = 'E' + ehdr.e_ident[2] = 'L' + ehdr.e_ident[3] = 'F' + ehdr.e_ident[4] = 2 // 64-bit format + ehdr.e_ident[5] = 1 // little endian + ehdr.e_ident[6] = 1 // ELFv1 is the only format + ehdr.e_ident[7] = 3 // Linux-compatible ABI + + ehdr.e_type = 0 // ET_NONE + ehdr.e_machine = 0x3E // x86_64 + ehdr.e_version = 1 // ELFv1 again + + ehdr.e_shoff = 64 // The Ehdr is 64 bytes long, sections start immediately following + ehdr.e_shentsize = 64 // Each Shdr is also 64 bytes long + ehdr.e_shnum = uint16(len(c.sections)) + ehdr.e_shstrndx = 0 // We always put the .shstrtab as the 0th section + + err := binary.Write(dest, binary.LittleEndian, &ehdr) + if err != nil { + return err + } + + // Don't declare a program header + + // Write section headers + pctr := 64 + (64 * len(c.sections)) + for _, sec := range c.sections { + shdr := Elf64_Shdr{} + + shdr.sh_name = uint32(sec.name_shstrtabOffset) + switch sec.name { + case ".text": + shdr.sh_type = 1 // SHT_PROGBITS, program data + shdr.sh_flags = 0x2 | 0x4 | 0x10 // WRITE|ALLOC|MERGE + case ".data": + shdr.sh_type = 1 // SHT_PROGBITS, program data + shdr.sh_flags = 0x2 | 0x10 // WRITE|MERGE + case ".symtab": + shdr.sh_type = 2 // SHT_SYMTAB + shdr.sh_flags = 0x10 | 0x20 // MERGE|STRINGS + case ".shstrtab": + shdr.sh_type = 3 // SHT_STRTAB + shdr.sh_flags = 0x10 | 0x20 // MERGE|STRINGS + case ".rodata": + fallthrough + default: // Treat anything unknown as read-only data + shdr.sh_type = 1 // SHT_PROGBITS, program data + shdr.sh_flags = 0x10 // MERGE + } + + shdr.sh_offset = uint64(pctr) + shdr.sh_size = uint64(sec.buff.Len()) + + pctr += sec.buff.Len() + + err = binary.Write(dest, binary.LittleEndian, &shdr) + if err != nil { + return err + } + } + + // Write binary content + for _, sec := range c.sections { + expectLen := sec.buff.Len() + n, err := sec.buff.WriteTo(dest) + if err != nil { + return err + } + if n != int64(expectLen) { + return io.ErrShortWrite + } + } + + // Done + return nil - panic("TODO") } diff --git a/elf.go b/elf.go index 48a82ba..83e26ef 100644 --- a/elf.go +++ b/elf.go @@ -1,5 +1,6 @@ package main +// Elf64_Ehdr is the main ELF header type Elf64_Ehdr struct { e_ident [16]byte e_type uint16 @@ -17,6 +18,7 @@ type Elf64_Ehdr struct { e_shstrndx uint16 } +// Elf64_Phdr is the Program Header type Elf64_Phdr struct { p_type uint32 p_flags uint32 @@ -28,6 +30,7 @@ type Elf64_Phdr struct { p_align uint64 } +// Elf64_Shdr is the Section header type Elf64_Shdr struct { sh_name uint32 sh_type uint32 @@ -40,3 +43,83 @@ type Elf64_Shdr struct { sh_addralign uint64 sh_entsize uint64 } + +const ( + STB_LOCAL = 0 + STB_GLOBAL = 1 + STB_WEAK = 2 + + STT_NOTYPE = 0 + STT_OBJECT = 1 + STT_FUNC = 2 + STT_SECTION = 3 + STT_FILE = 4 + STT_COMMON = 5 + STT_TLS = 6 + + STV_DEFAULT = 0 + STV_INTERNAL = 1 + STV_HIDDEN = 2 + STV_PROTECTED = 3 +) + +// Elf64_Sym is a symbol +type Elf64_Sym struct { + st_name uint32 + st_info byte + st_other byte + st_shndx uint16 + st_value uint64 + st_size uint64 +} + +// Elf64_Rela is a relocation with addend +type Elf64_Rela struct { + r_offset uint64 + r_info uint64 + r_addend int64 +} + +// Relocation types +type ElfRelocationType int + +const ( + R_X86_64_NONE ElfRelocationType = 0 + R_X86_64_64 ElfRelocationType = 1 + R_X86_64_PC32 ElfRelocationType = 2 + R_X86_64_GOT32 ElfRelocationType = 3 + R_X86_64_PLT32 ElfRelocationType = 4 + R_X86_64_COPY ElfRelocationType = 5 + R_X86_64_GLOB_DAT ElfRelocationType = 6 + R_X86_64_JUMP_SLOT ElfRelocationType = 7 + R_X86_64_RELATIVE ElfRelocationType = 8 + R_X86_64_GOTPCREL ElfRelocationType = 9 + R_X86_64_32 ElfRelocationType = 10 + R_X86_64_32S ElfRelocationType = 11 + R_X86_64_16 ElfRelocationType = 12 + R_X86_64_PC16 ElfRelocationType = 13 + R_X86_64_8 ElfRelocationType = 14 + R_X86_64_PC8 ElfRelocationType = 15 + R_X86_64_DTPMOD64 ElfRelocationType = 16 + R_X86_64_DTPOFF64 ElfRelocationType = 17 + R_X86_64_TPOFF64 ElfRelocationType = 18 + R_X86_64_TLSGD ElfRelocationType = 19 + R_X86_64_TLSLD ElfRelocationType = 20 + R_X86_64_DTPOFF32 ElfRelocationType = 21 + R_X86_64_GOTTPOFF ElfRelocationType = 22 + R_X86_64_TPOFF32 ElfRelocationType = 23 + R_X86_64_PC64 ElfRelocationType = 24 + R_X86_64_GOTOFF64 ElfRelocationType = 25 + R_X86_64_GOTPC32 ElfRelocationType = 26 + R_X86_64_GOT64 ElfRelocationType = 27 + R_X86_64_GOTPCREL64 ElfRelocationType = 28 + R_X86_64_GOTPC64 ElfRelocationType = 29 + R_X86_64_GOTPLT64 ElfRelocationType = 30 + R_X86_64_PLTOFF64 ElfRelocationType = 31 + R_X86_64_SIZE32 ElfRelocationType = 32 + R_X86_64_SIZE64 ElfRelocationType = 33 + R_X86_64_GOTPC32_TLSDESC ElfRelocationType = 34 + R_X86_64_TLSDESC_CALL ElfRelocationType = 35 + R_X86_64_TLSDESC ElfRelocationType = 36 + R_X86_64_IRELATIVE ElfRelocationType = 37 +) diff --git a/lexer.go b/lexer.go index 49127df..215d8f3 100644 --- a/lexer.go +++ b/lexer.go @@ -77,6 +77,9 @@ func (l *lexer) Next() (Token, error) { case "syscall": return SyscallInstrToken{}, nil + case "ret": + return RetInstrToken{}, nil + default: // If the field ends with `:`, it's a (local) label if strings.HasSuffix(fields[0], `:`) { diff --git a/main.go b/main.go index 9c13f45..efb895c 100644 --- a/main.go +++ b/main.go @@ -27,13 +27,14 @@ func assemble(src io.Reader, dest io.Writer) { lx := NewLexer(src) cc := NewCompiler() +mainloop: for { tok, err := lx.Next() if err != nil { if errors.Is(err, io.EOF) { // Reached EOF // Terminate compilation - panic("Completed OK") + break mainloop } // Real error @@ -48,4 +49,9 @@ func assemble(src io.Reader, dest io.Writer) { } } + + err := cc.Finalize(dest) + if err != nil { + panic(err) + } } diff --git a/main_test.go b/main_test.go index ca3733e..32c215f 100644 --- a/main_test.go +++ b/main_test.go @@ -1,7 +1,8 @@ package main import ( - "io/ioutil" + // "io/ioutil" + "os" "strings" "testing" ) @@ -10,9 +11,11 @@ func TestCompile(t *testing.T) { // @ref https://gist.github.com/armicron/e891709ce8893df2fd5fc74c846dcf20 const src = ` -section .data +section .rodata $msg = sz "Hello, world\n" $filename = sz "test.txt" + +section .data $fd = u64 0 section .text @@ -25,7 +28,7 @@ global _start: ;tell linker entry point syscall mov $fd, rax - mov rdx, 13 ;message strlen + mov rdx, strlen($msg) ;message strlen mov rsi, &$msg ;message to write mov rdi, $fd ;file descriptor mov rax, 1 ;system call number (sys_write) @@ -40,6 +43,15 @@ global _start: ;tell linker entry point ` - assemble(strings.NewReader(src), ioutil.Discard) + /* + assemble(strings.NewReader(src), ioutil.Discard) + */ + + fh, err := os.OpenFile("output.o", os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + panic(err) + } + + assemble(strings.NewReader(src), fh) // ioutil.Discard) } diff --git a/test_assemble.sh b/test_assemble.sh index 806736c..e49d198 100755 --- a/test_assemble.sh +++ b/test_assemble.sh @@ -3,6 +3,7 @@ echo "$1" > src.asm nasm -f elf64 src.asm +objdump -x src.o objdump -D src.o rm src.o rm src.asm diff --git a/token.go b/token.go index 6ca0217..1373340 100644 --- a/token.go +++ b/token.go @@ -21,6 +21,8 @@ type MovInstrToken struct { type SyscallInstrToken struct{} +type RetInstrToken struct{} + type DataVariableInstrToken struct { VarName string Sizeclass string // sz, u8, u16, u32, u64