NewBasic Compiler -- An x86 Compiler/Assembler/Disassembler for DOS

Using segment overrides in your assembly output

In C, almost every ordinary memory pointer accesses memory relative to the base of the program that contains it, as set up by whatever loaded the program (usually the operating system's kernel), and uses the DS segment.  For example, let's consider the following bit of code:

/* create an integer and a pointer in the global address space */
int  i;
int  *p;

int main(int argc, char *argv[]) {

  /* assign (point) p to the address of i */
  p = &i;

  /* assign i a value of 1 via p (example 0)*/
  *p = 1;

  /* assign i a value of 1 via p (example 1) */
  p[0] = 1;
  
  /* return the value in i (should be '1') */
  return i;
}

In the example above, NBC will create something similar to the following assembly code:
.model tiny
.code
.386

i  dd  0
p  dd  0

main:
  push   ebp
  mov    ebp,esp

  mov    eax,offset i
  mov    [p],eax
  mov    eax,[p]
  mov    ebx,eax
  mov    eax,1
  mov    [ebx],eax
  mov    eax,[p]
  mov    ebx,eax
  mov    eax,1
  mov    [ebx],eax
  mov    eax,[i]
  
  mov    esp,ebp
  pop    ebp
  ret  

.end

Notice that none of the memory accesses use a segment override; they all use the default (assumed) DS: segment.

If I were writing a general program to run under an operating system that has already been loaded, this would be the correct output.  However, since I am writing code for a loader file, where there is no pre-loaded operating system, I may need to make sure that memory accesses use a base other than the normal start of the program.

For example, what if I wanted to access the BIOS Data Area at physical address 0x00400?  In assembly, I would do the following:
  push   es
  mov    ax,0000h
  mov    es,ax
  mov    ax,es:[0400h]
  pop    es

What about in C?
  int  i;
  int *p;
  
  /* assign (point) p to the physical address 0x00400 */
  p = (int *) 0x00400;
  
  /* retrieve the value at 0x00400 */
  i = *p;

The code above looks like it should work as expected.  However, the machine word retrieved from *p is actually read from DS:[00400h].  If DS holds 0x0000, then all is good.  However, most of the time, DS does not hold 0x0000.

To fix this, you might try the following:
  int  i;
  int *p;
  
  /* assign (point) p to the physical address 0x00400 */
  p = (int *) 0x00400;
  
  /* modify DS to point to 0x00000 */
  _asm (
    "  push  ds \n"
    "  mov   ax,0000h \n"
    "  mov   ds,ax \n"
  );
  
  /* retrieve the value at 0x00400 */
  i = *p;   /*  <---- This does not do what you think it does */
  
  /* restore DS */
  _asm (
    "  pop  ds \n"
  );

However, the value read by the 'i = *p' statement will not be stored where you expect.  Because DS was changed, the store to 'i' also uses the new DS value, so the memory that 'i' actually occupies is not modified.  In fact, an address other than &i will be modified!!!

Also, what if you needed to access physical memory in multiple places?  You would have to place a lot of _asm() statements in your code.  There has to be an easier way!!!

This is where the modification to the compiler comes into play.  Take the above example, with the new modification:
  int  i;
  int farE *p; /* Notice the additional keyword */
  
  /* modify ES (once) to point to 0x00000
     then use ES: throughout the remaining of 
     our code for this purpose. */
  _asm (
    "  push  es \n"
    "  mov   ax,0000h \n"
    "  mov   es,ax    \n"
  );
  
  /* assign (point) p to the physical address es:0x00400 */
  p = (int farE *) 0x00400;
  
  /* retrieve the value at 0x00400 */
  i = *p;
  
  /* restore ES */
  _asm (
    "  pop  es \n"
  );

And the compiled output:
  push  es 
  mov   ax,0000h 
  mov   es,ax    
  
  mov    eax,0x0400
  mov    [ebp-8],eax   ; store 0x0400 into 'p'
  
  mov    eax,[ebp-8]   ; retrieve 0x0400 from 'p'
  mov    ebx,eax       ; place it in a mem-register
  mov    eax,es:[ebx]  ; retrieve the machine word from ES:[0x0400]  
                       ; *** notice the ES: override ***
  
  mov    [ebp-4],eax   ; store it in 'i'  *** Notice no segment override ****
  
  pop  es 

Any memory access using 'p' from this point on will include the 'ES:' override, regardless of what type is used or how it is accessed:
  int  i;
  unsigned char farE *p;
  
  /* modify ES (once) to point to 0x00000
     then use ES: throughout the remaining of 
     our code for this purpose. */
  _asm (
    "  push  es \n"
    "  mov   ax,0000h \n"
    "  mov   es,ax    \n"
  );
  
  /* assign (point) p to the physical address es:0x00400 */
  p = (unsigned char farE *) 0x00400;
  
  /* retrieve the value at 0x00400 */
  i = p[0];
  
  /* restore ES */
  _asm (
    "  pop  es \n"
  );

Notice that the example above now uses a byte-sized access, with []'s instead of '*'.  The compiled output:
  push  es 
  mov   ax,0000h 
  mov   es,ax    
  
  mov    eax,0x0400
  mov    [ebp-8],eax   ; store 0x0400 into 'p'
  
  mov    eax,[ebp-8]   ; retrieve 0x0400 from 'p'
  mov    ebx,eax       ; place it in a mem-register
  mov    al,es:[ebx]   ; retrieve the byte from ES:[0x0400]  
                       ; *** notice the ES: override ***
  movzx  eax,al        ; convert to the size of 'i'
  
  mov    [ebp-4],eax   ; store it in 'i'  *** Notice no segment override ****
  
  pop  es 

How about one more example using multiple overrides:
  int  i, k;
  unsigned char farE *p;
  unsigned int farF *t;
  
  /* modify ES (once) to point to 0x00000
     then use ES: throughout the remaining of 
     our code for this purpose.
     Similar for FS. */
  _asm (
    "  push  es \n"
    "  push  fs \n"
    "  mov   ax,0000h \n"
    "  mov   es,ax    \n"
    "  mov   ax,1234h \n"
    "  mov   fs,ax    \n"
  );
  
  /* assign (point) p to the physical address es:0x00400 */
  p = (unsigned char farE *) 0x00400;
  
  /* assign (point) t to the physical address fs:0x04321 */
  t = (unsigned int farF *) 0x04321;
  
  /* retrieve the value at es:0x00400 */
  i = p[0];
  
  /* retrieve the value at fs:0x04321 */
  k = *t;
  
  /* restore ES and FS*/
  _asm (
    "  pop  fs \n"
    "  pop  es \n"
  );

And the compiled output:
  push  es 
  push  fs 
  mov   ax,0000h 
  mov   es,ax    
  mov   ax,1234h 
  mov   fs,ax    

  mov    eax,0x0400
  mov    [ebp-12],eax   ; store 0x0400 into 'p'
  
  mov    eax,0x4321
  mov    [ebp-16],eax   ; store 0x4321 into 't'
  
  mov    eax,[ebp-12]   ; retrieve 0x0400 from 'p'
  mov    ebx,eax        ; place it in a mem-register

  mov    al,es:[ebx]    ; retrieve the byte from ES:[0x0400]  
                        ; *** notice the ES: override ***
  movzx  eax,al         ; convert to the size of 'i'
  mov    [ebp-4],eax    ; store it in 'i'  *** Notice no segment override ****
  
  mov    eax,[ebp-16]   ; retrieve 0x4321 from 't'
  mov    ebx,eax        ; place it in a mem-register

  mov    eax,fs:[ebx]   ; retrieve the machine word from FS:[0x4321]  
  mov    [ebp-8],eax    ; store it in 'k'  *** Notice no segment override ****
  
  pop  fs 
  pop  es

Please Note:

You must still assign a value to any segment register you use; however, you no longer have to worry about the segment override in your assembly output.
You may use the additional keywords 'farC', 'farD', 'farE', 'farF', and 'farG'.  (Note that 'farD' is not required unless you know that the normal access would use a segment other than DS:.  For example, you might cast an already defined farE pointer to a farD.)