pxasme/compile.go

456 lines
12 KiB
Go

package main
import (
"bytes"
"encoding/binary"
"fmt"
"io"
"strconv"
"strings"
)
// section is one named output section together with the bytes that have been
// accumulated for it so far.
type section struct {
// name is the section name, e.g. `.text`. FindOrCreateSection only accepts
// names that start with a period.
name string
// name_shstrtabOffset is the byte offset at which name is stored inside the
// .shstrtab buffer. Finalize copies it into the section header's sh_name.
name_shstrtabOffset int
// buff holds the raw contents of the section.
buff bytes.Buffer
}
// symtabEntry is the compiler's in-memory bookkeeping record for one symbol.
// It is what lookups by symbol name (relocations, strlen) consult; the bytes
// that end up in the .symtab section are written separately by CreateSymbol.
type symtabEntry struct {
	// symtabSectionIndex is the position of the symbol's entry within the
	// .symtab section, counted in entries rather than bytes.
	symtabSectionIndex int

	// sectionName is the name of the section that was current when the symbol
	// was created. kind is the class string supplied by the creator, such as
	// ".label" or ".var.sz".
	sectionName, kind string

	// offset and length are stored exactly as they were passed to CreateSymbol.
	offset, length int64

	// global is stored exactly as it was passed to CreateSymbol.
	global bool
}
// compiler accumulates section contents, symbols and relocations while tokens
// are compiled, and serialises them with Finalize.
type compiler struct {
	// symtab is a by-name lookup table of every symbol created so far.
	symtab map[string]symtabEntry

	// sections lists every section in creation order. The position in this
	// slice is used as the section's index in the emitted file.
	sections []section

	// currentSection is the section that emitted bytes are appended to; it is
	// nil until the first section token has been compiled.
	// shstrtab is the string table that stores the section names.
	// Both point at elements of sections.
	currentSection, shstrtab *section
}
// NewCompiler returns a compiler with an empty symbol table and a single
// section: the mandatory .shstrtab string table, which names the sections
// themselves. No current section is selected yet.
func NewCompiler() *compiler {
	c := new(compiler)
	c.symtab = make(map[string]symtabEntry)

	// The table's own name is stored right after the leading NUL byte, hence
	// offset 1.
	c.sections = append(c.sections, section{name: `.shstrtab`, name_shstrtabOffset: 1})
	c.shstrtab = &c.sections[0]

	// The first byte in a string table is conventionally expected to be \x00,
	// so that offset 0 can be used to reference the null string.
	names := &c.shstrtab.buff
	names.WriteByte(0)
	names.WriteString(c.shstrtab.name)
	names.WriteByte(0)

	return c
}
// CreateSymbol registers a new symbol that lives in the current section.
//
// The symbol is appended to the .symtab section (created on demand) and
// recorded in the by-name lookup table used for relocations. The ELF entry is
// always written as an unnamed, local, hidden STT_SECTION symbol that refers
// to the current section and carries length as its size; name, class, offset
// and global are only stored in the lookup table.
//
// An error is returned when the name is already taken, when no section has
// been started yet, or when the current section cannot be found in the
// section list. Nothing is modified in the first two cases, and the lookup
// table is only updated once the ELF entry has been written successfully.
func (c *compiler) CreateSymbol(name string, class string, offset int64, length int64, global bool) error {
	if _, ok := c.symtab[name]; ok {
		return fmt.Errorf("Symbol %q already exists", name)
	}
	if c.currentSection == nil {
		return fmt.Errorf("symbol %q declared outside of any section", name)
	}

	// Find the section index for the section containing this symbol
	owner := c.currentSection.name
	sectionIndex := -1
	for i := range c.sections {
		if c.sections[i].name == owner {
			sectionIndex = i
			break
		}
	}
	if sectionIndex == -1 {
		return fmt.Errorf("Current section missing index")
	}

	// Find the .symtab section, or create if it does not exist
	symtabSec := c.FindOrCreateSection(`.symtab`)

	// New entry index = length / len(entry) = length / 24
	nextIndex := symtabSec.buff.Len() / 24

	// Add to the .symtab section
	// This is required for variable references - after our single ELF .o is
	// created, linking it with any other .o files will create a combined .text
	// section where all the offsets have shifted
	esym := Elf64_Sym{}
	esym.st_name = 0 // Default: unnamed
	esym.st_info = STT_SECTION | (STB_LOCAL << 4)
	esym.st_other = STV_HIDDEN // For this translation unit only
	esym.st_shndx = uint16(sectionIndex)
	esym.st_size = uint64(length)
	if err := binary.Write(&symtabSec.buff, binary.LittleEndian, &esym); err != nil {
		return err
	}

	// Add to our fast lookup table, now that the entry really exists
	c.symtab[name] = symtabEntry{
		symtabSectionIndex: nextIndex,
		sectionName:        owner,
		kind:               class,
		offset:             offset,
		length:             length,
		global:             global,
	}
	return nil
}
// Must appends b to the current section and panics if the write reports an
// error or accepts fewer bytes than were supplied. A section must already be
// selected.
func (c *compiler) Must(b []byte) {
	switch n, err := c.currentSection.buff.Write(b); {
	case err != nil:
		panic(err)
	case n != len(b):
		panic(fmt.Errorf("Must: %w", io.ErrShortWrite))
	}
}
// MustUint64 appends val to the current section as 8 little-endian bytes,
// panicking under the same conditions as Must.
func (c *compiler) MustUint64(val uint64) {
	var encoded [8]byte
	binary.LittleEndian.PutUint64(encoded[:], val)
	c.Must(encoded[:])
}
// FindOrCreateSection returns the section called sectionName, creating an
// empty one (and recording its name in .shstrtab) when it does not exist yet.
//
// It panics when sectionName is empty or does not start with a period.
//
// Creating a section appends to c.sections, which may move the slice to a new
// backing array. c.shstrtab and c.currentSection are re-pointed at the live
// elements when that happens, so they stay valid. A *section obtained from an
// earlier call is NOT updated and must not be used after a later call that
// creates a section.
func (c *compiler) FindOrCreateSection(sectionName string) *section {
	if len(sectionName) == 0 || sectionName[0] != '.' {
		panic("section name should start with leading period")
	}

	for i := range c.sections {
		if c.sections[i].name == sectionName {
			// found it
			return &c.sections[i]
		}
	}

	// No section with this name. Create it.

	// Remember which elements the long-lived pointers refer to, so that they
	// can be re-pointed after append. Without this, a reallocation leaves them
	// aimed at orphaned copies and everything written through them is lost
	// from c.sections.
	shstrtabIdx, currentIdx := -1, -1
	for i := range c.sections {
		if c.shstrtab != nil && c.sections[i].name == c.shstrtab.name {
			shstrtabIdx = i
		}
		if c.currentSection != nil && c.sections[i].name == c.currentSection.name {
			currentIdx = i
		}
	}

	c.sections = append(c.sections, section{
		name:                sectionName,
		name_shstrtabOffset: c.shstrtab.buff.Len(),
		buff:                bytes.Buffer{},
	})

	if shstrtabIdx >= 0 {
		c.shstrtab = &c.sections[shstrtabIdx]
	}
	if currentIdx >= 0 {
		c.currentSection = &c.sections[currentIdx]
	}

	// Record the name only after re-pointing, so that it lands in the live
	// string table.
	c.shstrtab.buff.WriteString(sectionName)
	c.shstrtab.buff.WriteByte(0)

	return &c.sections[len(c.sections)-1]
}
// Reloc records a relocation at the current end of the current section.
//
// The entry is appended to the section named ".rela" + the current section's
// name, which is created on demand. symbolName must already be known to the
// compiler; mode is the relocation type and addOffset the addend. The caller
// is expected to emit the placeholder bytes to be patched right after this
// call, because the recorded offset is the section length at call time.
//
// An error is returned, and nothing is created or modified, when no section
// has been started or when symbolName is unknown.
func (c *compiler) Reloc(symbolName string, mode ElfRelocationType, addOffset int64) error {
	if c.currentSection == nil {
		return fmt.Errorf("relocation against %q outside of any section", symbolName)
	}

	// Find target symbol first, so that a failed lookup does not leave an
	// empty .rela section behind
	syminfo, ok := c.symtab[symbolName]
	if !ok {
		return fmt.Errorf("Reference to unknown symbol %q", symbolName)
	}

	// Capture the patch location before the section list can change
	patchOffset := uint64(c.currentSection.buff.Len())

	// Find '.rela.{currentsection}', creating it if it does not exist
	relaSec := c.FindOrCreateSection(`.rela` + c.currentSection.name)

	// Add the relocation to the .rela section
	rr := Elf64_Rela{}
	rr.r_offset = patchOffset
	// high bits: index of the target symbol in the symtab. low bits: mode type
	rr.r_info = uint64(syminfo.symtabSectionIndex)<<32 | uint64(mode)
	rr.r_addend = addOffset

	return binary.Write(&relaSec.buff, binary.LittleEndian, &rr)
}
// Compile translates a single token into section contents, symbols and
// relocations.
//
// The first token must be a section token; every later token is emitted into
// the most recently selected section. An error is returned for malformed or
// unsupported tokens. Operand combinations of mov that have no encoding yet
// still panic, as they are unfinished parts of the compiler rather than input
// errors.
func (c *compiler) Compile(t Token) error {
	if c.currentSection == nil {
		// The only allowable token outside of a section is to start a new section
		if _, ok := t.(SectionToken); !ok {
			return fmt.Errorf("Need to start with a section token, got %T", t)
		}
	}

	switch tok := t.(type) {
	case SectionToken:
		c.currentSection = c.FindOrCreateSection(tok.SectionName)
		return nil

	case DataVariableInstrToken:
		return c.compileDataVariable(tok)

	case LabelToken:
		return c.CreateSymbol(tok.LabelName, ".label", int64(c.currentSection.buff.Len()), 0, tok.IsGlobal)

	case MovInstrToken:
		return c.compileMov(tok)

	case SyscallInstrToken:
		c.Must([]byte{0x0f, 0x05}) // syscall
		return nil

	case RetInstrToken:
		c.Must([]byte{0xc3}) // ret
		return nil

	default:
		return fmt.Errorf("can't compile token of type %T", t)
	}
}

// compileDataVariable emits the bytes of a data variable into the current
// section and registers a symbol that covers exactly those bytes.
func (c *compiler) compileDataVariable(tok DataVariableInstrToken) error {
	// Where the variable starts; its size is measured once the bytes are out
	start := c.currentSection.buff.Len()

	// Generate bytes for the symbol
	switch tok.Sizeclass {
	case "u8":
		// 1 byte literal
		val, err := strconv.ParseUint(tok.Value, 10, 8)
		if err != nil {
			return err
		}
		c.Must([]byte{byte(val)})

	case "u64":
		// 8-byte literal
		val, err := strconv.ParseUint(tok.Value, 10, 64)
		if err != nil {
			return err
		}
		c.MustUint64(val)

	case "sz":
		// string with null termination
		c.Must(append([]byte(tok.Value), 0))

	default:
		return fmt.Errorf("variable %q has unknown size class %q", tok.VarName, tok.Sizeclass)
	}

	size := c.currentSection.buff.Len() - start

	// Stash in symbol table for future backreferences
	// TODO allow making global symbols?
	// CreateSymbol does check for duplicate names already
	return c.CreateSymbol(tok.VarName, ".var."+tok.Sizeclass, int64(start), int64(size), false)
}

// movabsPrefix returns the two bytes (REX.W prefix, opcode B8+rd) that start a
// "mov r64, imm64" instruction targeting reg, and false when reg is not one of
// the supported registers.
func movabsPrefix(reg string) ([]byte, bool) {
	var rd byte
	switch reg {
	case "rax":
		rd = 0
	case "rcx":
		rd = 1
	case "rdx":
		rd = 2
	case "rbx":
		rd = 3
	case "rsi":
		rd = 6
	case "rdi":
		rd = 7
	default:
		return nil, false
	}
	return []byte{0x48, 0xb8 + rd}, true
}

// compileMov encodes the supported forms of mov:
//
//	mov rxx, imm           store a 64-bit immediate in a register
//	mov $var, rxx          store a register into a variable
//	mov rxx, $var          load a variable into a register
//	mov rxx, &$var         load the address of a variable into a register
//	mov rxx, strlen($var)  load the string length of an sz variable
func (c *compiler) compileMov(tok MovInstrToken) error {
	if len(tok.Args) < 2 {
		return fmt.Errorf("mov expects 2 operands, got %d", len(tok.Args))
	}
	dst, src := tok.Args[0], tok.Args[1]

	// TODO encode more cases properly
	if literal, err := strconv.ParseInt(src, 10, 64); err == nil {
		// mov rxx, imm
		// Store immediate in register
		prefix, ok := movabsPrefix(dst)
		if !ok {
			// Store immediate in variable?
			panic("mov rxx,imm pattern: missing case")
		}
		c.Must(prefix) // TODO store in eax with shorter prefix if <32 bit
		c.MustUint64(uint64(literal))
		return nil
	}

	switch {
	case strings.HasPrefix(dst, `$`):
		// mov $var, rxx
		// Load register's contents into variable
		// x86_64 can only really do this in a single instruction with 32-bit displacement, not full 64-bit
		switch src {
		case "rax":
			c.Must([]byte{0x48, 0x89, 0x04, 0x25})
		default:
			panic("mov $var,rax pattern: missing case")
		}
		// Declare that this is a 32-bit reloc, not a 64-bit one
		if err := c.Reloc(dst[1:], R_X86_64_32S, 0); err != nil {
			return fmt.Errorf("mov with relocation: %w", err)
		}
		c.Must([]byte{0, 0, 0, 0}) // 32-bit
		return nil

	case strings.HasPrefix(src, `$`):
		// mov rxx, $var
		// With $; load variable contents into register
		switch dst {
		case "rax":
			c.Must([]byte{0x48, 0x8b, 0x04, 0x25})
		case "rdi":
			c.Must([]byte{0x48, 0x8b, 0x3c, 0x25})
		default:
			panic("mov rxx,$var pattern: missing case")
		}
		// Declare that this is a 32-bit reloc, not a 64-bit one
		if err := c.Reloc(src[1:], R_X86_64_32S, 0); err != nil {
			return fmt.Errorf("mov with relocation: %w", err)
		}
		c.Must([]byte{0, 0, 0, 0}) // 32-bit
		return nil

	case strings.HasPrefix(src, `&$`):
		// mov rxx, &$var
		// With &; assign exact address of variable to register
		// This creates a movabs literal & a relocation entry
		// It's always 64-bit
		prefix, ok := movabsPrefix(dst)
		if !ok {
			panic("mov $var,rxx pattern: missing case")
		}
		c.Must(prefix)
		if err := c.Reloc(src[2:], R_X86_64_64, 0); err != nil {
			return fmt.Errorf("mov with relocation: %w", err)
		}
		c.MustUint64(0)
		return nil

	case strings.HasPrefix(src, `strlen($`) && strings.HasSuffix(src, `)`):
		// mov rxx, strlen($var)
		// With strlen; if this is an sz symbol, supply its length
		symname := src[len(`strlen($`) : len(src)-1]
		sym, ok := c.symtab[symname]
		if !ok {
			return fmt.Errorf("Can't strlen on unknown variable %q", symname)
		}
		if sym.kind != ".var.sz" {
			return fmt.Errorf("Can't take the strlen of variable %q with type %q (expected sz)", symname, sym.kind)
		}
		// The recorded length covers every emitted byte, including the
		// terminating NUL, which a string length does not count
		effective := sym.length - 1
		return c.compileMov(MovInstrToken{Args: []string{dst, strconv.FormatInt(effective, 10)}})

	default:
		panic("unknown mov type, sorry")
	}
}
// Finalize exports the compiled sections into an ELF artefact.
// The resulting ELF is not executable directly, but it can be once fully
// linked (adding a program header and page alignment)
//
// The layout is: the 64-byte ELF header, one 64-byte section header per
// section in creation order, then the section contents back to back in the
// same order. The compiler's buffers are left untouched, and the first error
// reported by dest is returned.
func (c *compiler) Finalize(dest io.Writer) error {
	// Section attribute bits for sh_flags, numbered as in the ELF gABI
	const (
		shfWrite     = 0x1
		shfAlloc     = 0x2
		shfExecInstr = 0x4
		shfMerge     = 0x10
		shfStrings   = 0x20
	)

	// Write ELF header
	ehdr := Elf64_Ehdr{}
	ehdr.e_ident[0] = 0x7f
	ehdr.e_ident[1] = 'E'
	ehdr.e_ident[2] = 'L'
	ehdr.e_ident[3] = 'F'
	ehdr.e_ident[4] = 2   // 64-bit format
	ehdr.e_ident[5] = 1   // little endian
	ehdr.e_ident[6] = 1   // ELFv1 is the only format
	ehdr.e_ident[7] = 3   // Linux-compatible ABI
	ehdr.e_type = 0       // ET_NONE
	ehdr.e_machine = 0x3E // x86_64
	ehdr.e_version = 1    // ELFv1 again
	ehdr.e_shoff = 64     // The Ehdr is 64 bytes long, sections start immediately following
	ehdr.e_shentsize = 64 // Each Shdr is also 64 bytes long
	ehdr.e_shnum = uint16(len(c.sections))
	ehdr.e_shstrndx = 0 // We always put the .shstrtab as the 0th section
	if err := binary.Write(dest, binary.LittleEndian, &ehdr); err != nil {
		return err
	}

	// Don't declare a program header

	// Write section headers
	// pctr tracks the file offset at which the next section's contents start
	pctr := 64 + (64 * len(c.sections))
	for i := range c.sections {
		sec := &c.sections[i]

		shdr := Elf64_Shdr{}
		shdr.sh_name = uint32(sec.name_shstrtabOffset)

		switch sec.name {
		case ".text":
			shdr.sh_type = 1 // SHT_PROGBITS, program data
			shdr.sh_flags = shfAlloc | shfExecInstr

		case ".data":
			shdr.sh_type = 1 // SHT_PROGBITS, program data
			shdr.sh_flags = shfWrite | shfAlloc

		case ".rodata":
			shdr.sh_type = 1 // SHT_PROGBITS, program data
			shdr.sh_flags = shfAlloc

		case ".symtab":
			shdr.sh_type = 2 // SHT_SYMTAB
			shdr.sh_flags = shfMerge | shfStrings

		case ".shstrtab":
			shdr.sh_type = 3 // SHT_STRTAB
			shdr.sh_flags = shfMerge | shfStrings

		default:
			// Anything unknown, which includes the .rela.* sections, keeps
			// its existing header: non-loaded program data.
			// NOTE(review): relocation sections presumably need SHT_RELA plus
			// link/info fields before a linker will honour them - confirm
			// against the Elf64_Shdr definition.
			shdr.sh_type = 1 // SHT_PROGBITS, program data
			shdr.sh_flags = shfMerge
		}

		shdr.sh_offset = uint64(pctr)
		shdr.sh_size = uint64(sec.buff.Len())
		pctr += sec.buff.Len()

		if err := binary.Write(dest, binary.LittleEndian, &shdr); err != nil {
			return err
		}
	}

	// Write binary content
	// Ranging by value copies each buffer header, so draining the copy with
	// WriteTo does not consume the compiler's own buffers
	for _, sec := range c.sections {
		expectLen := sec.buff.Len()
		n, err := sec.buff.WriteTo(dest)
		if err != nil {
			return err
		}
		if n != int64(expectLen) {
			return io.ErrShortWrite
		}
	}

	// Done
	return nil
}