package main import ( "bytes" "encoding/binary" "fmt" "io" "strconv" "strings" ) type section struct { name string name_shstrtabOffset int buff *bytes.Buffer } type symtabEntry struct { // The index of this symbol within the whole symtab symtabSectionIndex int name_shstrtabOffset int sectionName string kind string offset int64 length int64 global bool } type compiler struct { symtab map[string]symtabEntry sections []section currentSection *section shstrtab *section } func NewCompiler() *compiler { c := &compiler{ symtab: map[string]symtabEntry{}, } // Fake 0th entry // First, there's an all-zero entry that is reserved for extended ELF headers c.sections = append(c.sections, section{}) // Real entry: shstrtab c.sections = append(c.sections, section{ name: `.shstrtab`, // Mandatory: the table that names sections themselves buff: &bytes.Buffer{}, }) c.shstrtab = &c.sections[1] // The first byte in a string table is conventionally expected be \x00, so that you can reference // null strings with it c.StringTable("") c.shstrtab.name_shstrtabOffset = c.StringTable(c.shstrtab.name) return c } func (c *compiler) StringTable(text string) int { pos := c.shstrtab.buff.Len() c.shstrtab.buff.WriteString(text) c.shstrtab.buff.WriteByte(0) return pos } func (c *compiler) CreateSymbol(name string, class string, offset int64, length int64, global bool) error { if _, ok := c.symtab[name]; ok { return fmt.Errorf("Symbol %q already exists", name) } // fmt.Printf("--> CreateSymbol(%s)\n", name) // Find the .symtab section, or create if it does not exist symtabSec := c.FindOrCreateSection(`.symtab`) if symtabSec.buff.Len() == 0 { // First time initialized // Add a zeroth symtab entry - zero is a sentinel, not a usable entry symtabSec.buff.Write(make([]byte, 8*3)) } // New entry index = length / len(entry) = length / 24 nextIndex := symtabSec.buff.Len() / 24 // Add to our fast lookup table ste := symtabEntry{ symtabSectionIndex: nextIndex, kind: class, offset: offset, global: global, length: length, } // Find the section index for the section containing this symbol var srcSectionIdx int = 0 var sttType uint8 = STT_NOTYPE if class == `.section` { ste.sectionName = name srcSectionIdx = len(c.sections) - 1 // The most recent added section sttType = STT_SECTION } else if c.currentSection != nil { ste.sectionName = c.currentSection.name var ok bool srcSectionIdx, ok = c.FindSectionIndex(c.currentSection.name) if !ok { panic("current section does not exist?") } } else { panic("Symbol is neither a section, nor within a section (?)") } // Add to the .symtab section // This is required for variable references - after our single ELF .o is // created, linking it with any other .o files will create a combined .text // section where all the offsets have shifted esym := Elf64_Sym{} esym.St_value = uint64(offset) if class == `.section` { esym.St_name = uint32(c.StringTable(name)) // Write name into public string table esym.St_info = sttType | (STB_LOCAL << 4) esym.St_other = STV_DEFAULT esym.St_shndx = uint16(srcSectionIdx) } else if global { esym.St_name = uint32(c.StringTable(name)) // Write name into public string table esym.St_info = sttType | (STB_GLOBAL << 4) esym.St_other = STV_DEFAULT esym.St_shndx = uint16(srcSectionIdx) } else { // Private variable for this translation unit // Needs an entry, but no need to expose the name esym.St_name = 0 // uint32(c.StringTable(name)) // Write name into public string table // 0 // Default: unnamed (0th entry in our string table is \x00) esym.St_info = sttType | (STB_LOCAL << 4) esym.St_other = STV_HIDDEN // For this translation unit only esym.St_shndx = uint16(srcSectionIdx) } fmt.Printf("-->New symbol %q in section %q (sectionidx %v)\n", name, ste.sectionName, srcSectionIdx) esym.St_size = uint64(length) err := binary.Write(symtabSec.buff, binary.LittleEndian, &esym) if err != nil { return err } // Stash in symtabEntry ste.name_shstrtabOffset = int(esym.St_name) c.symtab[name] = ste return nil } func (c *compiler) Must(b []byte) { n, err := c.currentSection.buff.Write(b) if err != nil { panic(err) } if n != len(b) { panic(fmt.Errorf("Must: %w", io.ErrShortWrite)) } } func (c *compiler) MustUint64(val uint64) { ret := make([]byte, 8) binary.LittleEndian.PutUint64(ret, val) c.Must(ret) } func (c *compiler) FindSectionIndex(sectionName string) (int, bool) { for i, sec := range c.sections { if sec.name != sectionName { continue } // found it return i, true } return 0, false } func (c *compiler) FindOrCreateSection(sectionName string) *section { if len(sectionName) == 0 || sectionName[0] != '.' { panic("section name should start with leading period") } if i, ok := c.FindSectionIndex(sectionName); ok { return &c.sections[i] } // No section with this name. Create it c.sections = append(c.sections, section{ name: sectionName, buff: &bytes.Buffer{}, }) sec := &c.sections[len(c.sections)-1] // Create a symbol for it // This creates a string table entry for us err := c.CreateSymbol(sectionName, ".section", 0, 0, true) if err != nil { panic("CreateSymbol: " + err.Error()) } sec.name_shstrtabOffset = c.StringTable(sectionName) return sec } func (c *compiler) Reloc(symbolName string, mode ElfRelocationType) error { // Find '.rela.{currentsection}', creating it if it does not exist var relaSec *section = c.FindOrCreateSection(`.rela` + c.currentSection.name) // Find target symbol syminfo, ok := c.symtab[symbolName] if !ok { return fmt.Errorf("Reference to unknown symbol %q", symbolName) } // Find the symbol pointing to its parent section /* parentSectionSyminfo, ok := c.symtab[syminfo.sectionName] if !ok { return fmt.Errorf("Bad parent section") } fmt.Printf("-->Relocation %q found in %q (sectionidx %d)\n", symbolName, syminfo.sectionName, parentSectionSyminfo.symtabSectionIndex) rootSymbol := parentSectionSyminfo.symtabSectionIndex if rootSymbol == 5 { rootSymbol = 7 } */ rootSymbol := syminfo.symtabSectionIndex // Add the relocation to the .rela section rr := Elf64_Rela{} rr.R_offset = uint64(c.currentSection.buff.Len()) rr.R_info = uint64(rootSymbol)<<32 | uint64(mode) // high bits: Index of search symbol in the symtab (the source section). low bits: mode type rr.R_addend = 0 // syminfo.offset // Add to the result when relocating (offset within source section) err := binary.Write(relaSec.buff, binary.LittleEndian, &rr) if err != nil { return err } // Done return nil } func (c *compiler) Compile(t Token) error { if c.currentSection == nil { // The only allowable token outside of a section is to start a new section if _, ok := t.(SectionToken); !ok { return fmt.Errorf("Need to start with a section token, got %#t", t) } } switch tok := t.(type) { case SectionToken: c.currentSection = c.FindOrCreateSection(tok.SectionName) return nil case DataVariableInstrToken: // Stash in symbol table for future backreferences // TODO allow making global symbols? // CreateSymbol does check for duplicate names already position := int64(c.currentSection.buff.Len()) // Generate bytes for the symbol switch tok.Sizeclass { case "u8": // 1 byte literal val, err := strconv.ParseUint(tok.Value, 10, 8) if err != nil { return err } c.Must([]byte{byte(val)}) case "u64": // 8-byte literal val, err := strconv.ParseUint(tok.Value, 10, 64) if err != nil { return err } c.MustUint64(val) case "sz": // string with null termination ret := []byte(tok.Value) ret = append(ret, 0) c.Must(ret) default: return fmt.Errorf("variable %q has unknown size class %q", tok.VarName, tok.Sizeclass) } err := c.CreateSymbol(tok.VarName, ".var."+tok.Sizeclass, int64(position), int64(c.currentSection.buff.Len())-position, false) if err != nil { return err } return nil case LabelToken: return c.CreateSymbol(tok.LabelName, ".label", int64(c.currentSection.buff.Len()), 0, tok.IsGlobal) case MovInstrToken: // TODO encode more cases properly if literal, err := strconv.ParseInt(tok.Args[1], 10, 64); err == nil { // mov rxx, imm // Store immediate in register switch tok.Args[0] { case "rax": c.Must([]byte{0x48, 0xb8}) // TODO store in eax with shorter prefix if <32 bit c.MustUint64(uint64(literal)) case "rbx": c.Must([]byte{0x48, 0xbb}) // TODO store in eax with shorter prefix if <32 bit c.MustUint64(uint64(literal)) case "rcx": c.Must([]byte{0x48, 0xb9}) // TODO store in eax with shorter prefix if <32 bit c.MustUint64(uint64(literal)) case "rdx": c.Must([]byte{0x48, 0xba}) // TODO store in eax with shorter prefix if <32 bit c.MustUint64(uint64(literal)) case "rsi": c.Must([]byte{0x48, 0xbe}) // TODO store in eax with shorter prefix if <32 bit c.MustUint64(uint64(literal)) case "rdi": c.Must([]byte{0x48, 0xbf}) // TODO store in eax with shorter prefix if <32 bit c.MustUint64(uint64(literal)) default: // Store immediate in variable? panic("mov rxx,imm pattern: missing case") } return nil } else if strings.HasPrefix(tok.Args[0], `$`) { // mov $var, rxx // Load register's contents into variable // x86_64 can only really do this in a single instruction with 32-bit displacement, not full 64-bit // The PIC alternative is to transform this into `lea symbol(%rip), %rdi` switch tok.Args[1] { case "rax": c.Must([]byte{0x48, 0x89, 0x04, 0x25}) default: panic("mov $var,rax pattern: missing case") } err = c.Reloc(tok.Args[0][1:], R_X86_64_32S) // Declare that this is a 32-bit reloc, not a 64-bit one if err != nil { return fmt.Errorf("mov with relocation: %w", err) } c.Must([]byte{0, 0, 0, 0}) // 32-bit return nil } else if strings.HasPrefix(tok.Args[1], `$`) { // mov rxx, $var // With $; load variable contents into register switch tok.Args[0] { case "rax": c.Must([]byte{0x48, 0x8b, 0x04, 0x25}) case "rdi": c.Must([]byte{0x48, 0x8b, 0x3c, 0x25}) default: panic("mov rxx,$var pattern: missing case") } err = c.Reloc(tok.Args[1][1:], R_X86_64_32S) // Declare that this is a 32-bit reloc, not a 64-bit one if err != nil { return fmt.Errorf("mov with relocation: %w", err) } c.Must([]byte{0, 0, 0, 0}) // 32-bit return nil } else if strings.HasPrefix(tok.Args[1], `&$`) { // mov rxx, &$var // With &; assign exact address of variable to register // This creates a movabs literal & a relocation entry // It's always 64-bit switch tok.Args[0] { case "rax": c.Must([]byte{0x48, 0xb8}) // TODO store in eax with shorter prefix if <32 bit case "rsi": c.Must([]byte{0x48, 0xbe}) // TODO store in eax with shorter prefix if <32 bit case "rdi": c.Must([]byte{0x48, 0xbf}) // TODO store in eax with shorter prefix if <32 bit default: panic("mov $var,rxx pattern: missing case") } err = c.Reloc(tok.Args[1][2:], R_X86_64_64) if err != nil { return fmt.Errorf("mov with relocation: %w", err) } c.MustUint64(0) return nil } else if strings.HasPrefix(tok.Args[1], `strlen($`) && strings.HasSuffix(tok.Args[1], `)`) { // mov rxx, strlen($var) // With strlen; if this is an sz symbol, supply its length symname := tok.Args[1][8 : len(tok.Args[1])-1] sym, ok := c.symtab[symname] if !ok { return fmt.Errorf("Can't strlen on unknown variable %q", symname) } if sym.kind != ".var.sz" { return fmt.Errorf("Can't take the strlen of variable %q with type %q (expected sz)", symname, sym.kind) } effective := sym.length return c.Compile(MovInstrToken{Args: []string{tok.Args[0], strconv.Itoa(int(effective))}}) } else { panic("unknown mov type, sorry") } case SyscallInstrToken: c.Must([]byte{0x0f, 0x05}) // syscall return nil case RetInstrToken: c.Must([]byte{0xc3}) // ret return nil case NopInstrToken: c.Must([]byte{0x90}) // nop return nil default: return fmt.Errorf("can't compile token of type %#t", t) } } // Finalize exports the compiled sections into an ELF artefact. // The resulting ELF is not executable directly, but it can be once fully // linked (adding a program header and page alignment) func (c *compiler) Finalize(dest io.Writer) error { // Find some well-known section indexes symtabSectionIndex, ok := c.FindSectionIndex(`.symtab`) if !ok { return fmt.Errorf("No symbol table present") } shstrtabSectionIndex, ok := c.FindSectionIndex(`.shstrtab`) if !ok { return fmt.Errorf("No string table present") } // (Safely) move all global symtab to the end // Because there may be existing references to global symtab entries (e.g. relocs) // just duplicate them in place tmp := c.sections[symtabSectionIndex].buff.Bytes() extraSymtabContent := bytes.Buffer{} for i := 0; i < len(tmp); i += 24 { sym := Elf64_Sym{} err := binary.Read(bytes.NewReader(tmp[i:i+24]), binary.LittleEndian, &sym) if err != nil { return err } if sym.St_info&(STB_GLOBAL<<4) == 0 { continue // not a global symbol } // Was a global symbol // Re-add the global symbol at the end extraSymtabContent.Write(tmp[i : i+24]) // Patch the existing symbol sym.St_name = 0 sym.St_info &= ^uint8(STB_GLOBAL << 4) replacement := bytes.Buffer{} err = binary.Write(&replacement, binary.LittleEndian, &sym) if err != nil { return err } copy(tmp[i:i+24], replacement.Bytes()) } numLocalSymbols := len(tmp) / 24 c.sections[symtabSectionIndex].buff.Write(extraSymtabContent.Bytes()) // Write ELF header ehdr := Elf64_Ehdr{} ehdr.e_ident[0] = 0x7f ehdr.e_ident[1] = 'E' ehdr.e_ident[2] = 'L' ehdr.e_ident[3] = 'F' ehdr.e_ident[4] = 2 // 64-bit format ehdr.e_ident[5] = 1 // little endian ehdr.e_ident[6] = 1 // ELFv1 is the only format ehdr.e_ident[7] = 0 // Don't declare any ABI ehdr.e_type = ET_REL ehdr.e_machine = 0x3E // x86_64 ehdr.e_version = 1 // ELFv1 again //ehdr.e_flags = 11 // ???? ehdr.e_ehsize = 64 ehdr.e_shoff = 64 // The Ehdr is 64 bytes long, sections start immediately following ehdr.e_shentsize = 64 // Each Shdr is also 64 bytes long ehdr.e_shnum = uint16(len(c.sections)) ehdr.e_shstrndx = uint16(shstrtabSectionIndex) err := binary.Write(dest, binary.LittleEndian, &ehdr) if err != nil { return err } // Don't declare a program header // Write fake 0th section header dest.Write(make([]byte, 64)) // Write remaining section headers pctr := 64 + (64 * len(c.sections)) for _, sec := range c.sections[1:] { shdr := Elf64_Shdr{} shdr.sh_name = uint32(sec.name_shstrtabOffset) switch sec.name { case ".text": shdr.sh_type = SHT_PROGBITS shdr.sh_flags = SHF_ALLOC | SHF_EXECINSTR shdr.sh_addralign = 16 // Request for final linking case ".data": shdr.sh_type = SHT_PROGBITS shdr.sh_flags = SHF_WRITE | SHF_ALLOC shdr.sh_addralign = 4 // Request for final linking case ".symtab": shdr.sh_type = SHT_SYMTAB shdr.sh_flags = 0 shdr.sh_info = uint32(numLocalSymbols) // sh_info points to the first global symbol. Global symbols must go after local symbols shdr.sh_entsize = 24 // Size in bytes of each entry shdr.sh_link = 1 // The index of the section containing the actual strings. We reuse shstrtab(!?!) shdr.sh_addralign = 8 // Request for final linking case ".shstrtab": shdr.sh_type = SHT_STRTAB shdr.sh_flags = 0 shdr.sh_addralign = 1 // Not doing any proper alignment case ".rodata": shdr.sh_type = SHT_PROGBITS shdr.sh_flags = SHF_ALLOC shdr.sh_addralign = 4 // Request for final linking default: if strings.HasPrefix(sec.name, ".rela.") { shdr.sh_type = SHT_RELA shdr.sh_flags = 0 // ? shdr.sh_link = uint32(symtabSectionIndex) shdr.sh_entsize = 24 // Size in bytes of each entry // Find the index of the section for which this relocates. Match by name srcSectionIdx, ok := c.FindSectionIndex(sec.name[5:]) if !ok { return fmt.Errorf("Missing parent section for relocation section %q", sec.name) } shdr.sh_info = uint32(srcSectionIdx) shdr.sh_addralign = 8 // Request for final linking } else { return fmt.Errorf("don't know the right flags to use for section %q", sec.name) } } shdr.sh_offset = uint64(pctr) shdr.sh_size = uint64(sec.buff.Len()) pctr += sec.buff.Len() err = binary.Write(dest, binary.LittleEndian, &shdr) if err != nil { return err } } // Write binary content for _, sec := range c.sections[1:] { expectLen := sec.buff.Len() n, err := sec.buff.WriteTo(dest) if err != nil { return err } if n != int64(expectLen) { return io.ErrShortWrite } } // Done return nil }