Malicious RTF Detection POC

# This is a POC for detecting malicious RTF documents. The two algorithms are simple.
# The first one counts the amount of non-ASCII data in a file and the second
# calculates the entropy of ASCII hex blobs. Please see the comments and code below for more details.
# These checks can be broken pretty easily, but the script currently detects 97% of the .RTF samples on
# contagiodump. Out of 169 random .RTFs found via Google and FTP searches there was 1 FP. The
# FP was possibly caused by Unicode text. There is no error handling. Just make sure the file is an
# .RTF and the script has read rights.
# Written by
# Usage:
#   Scan a single RTF document:  python <this_script>.py <bad.rtf>
#   Scan the working directory:  python <this_script>.py

import sys
import os
import re
import string
import math 

def check_header(fi):
    r"""Warn on stdout when the file does not carry the RTF ``\rt`` marker.

    Non-RTF files give false positives in the other checks, so this only
    prints a warning; it returns nothing and does not stop the scan.

    fi: path of the file to inspect.

    NOTE(review): the read expression was missing from the source (a bare
    ``block =``).  Reading a short prefix is a reconstruction -- confirm the
    intended length against the original script.
    """
    # A BOM (3 bytes) followed by "{\rt" (4 bytes) still fits in this prefix.
    header_len = 8
    # The context manager closes the handle; the original never closed it.
    with open(fi, 'rb') as f:
        block = f.read(header_len)
    # Bytes literal because the file is opened in binary mode.
    if b'\\rt' not in block:
        print("Warning: Header not found in %s  Not an .RTF document" % fi)
        # Emit a tab with no newline, as the original `print '\t',` did, so
        # the next line printed for this file is indented under the warning.
        sys.stdout.write('\t')

def H(data):
    """Return the Shannon entropy of ``data`` in bits per symbol.

    Adapted from Ero's blog.  Only the 256 single-byte values are counted.

    data: a byte string (``str`` on Python 2, ``bytes``/``bytearray`` on
        Python 3) or a text string.
    Returns 0 for empty input, otherwise a float between 0.0 and 8.0.
    """
    if not data:
        return 0
    size = len(data)
    # A binary read on Python 3 yields ``bytes``, whose count() rejects the
    # text needle chr(x).  Build the needle in the same type as the data.
    binary = isinstance(data, (bytes, bytearray))
    entropy = 0
    for x in range(256):
        needle = bytes(bytearray((x,))) if binary else chr(x)
        p_x = float(data.count(needle)) / size
        if p_x > 0:
            entropy += -p_x * math.log(p_x, 2)
    return entropy

def shell_ent(fi):
    """Flag runs of ASCII-hex blocks whose entropy looks like shellcode.

    The file is read in 128-byte blocks.  A block made up entirely of hex
    digits ([a-fA-F0-9]) has its entropy computed with H().  ASCII-hex
    shellcode has consistent entropy between 3.6 and 4.0, which lets it be
    detected in files that contain no non-ASCII bytes -- commonly seen in
    shellcode that downloads and executes a file rather than dropping one.
    When the run counter reaches 16 a "Suspicious" line is printed.

    fi: path of the file to scan.  Returns nothing.

    NOTE(review): the source was damaged here.  Both read expressions, the
    ``re.search(`` prefix and the ``else:`` before ``inc = 0`` are
    reconstructions.  As shown, ``inc = 0`` ran unconditionally, so the
    counter could never reach 16.  Confirm against the original script.
    """
    index = 0
    block_size = 128
    tmp = 0
    inc = 0
    # Compiled once; a bytes pattern because the file is read in binary mode.
    hex_blob = re.compile(b'[a-fA-F0-9]{128}')
    with open(fi, 'rb') as f:
        data = f.read(block_size)
        # An empty read marks end of file (holds for str and for bytes).
        while data:
            if hex_blob.search(data):
                entropy = H(data)
                if 4.0 > entropy > 3.6:
                    # tmp holds the previous iteration's index, so this is
                    # true on every iteration except the first.
                    if tmp == index - 16:
                        inc = inc + 1
                    if inc == 16:
                        print("Suspicious: shellcode entropy block at %s in %s" % (hex(index), fi))
            else:
                # A non-hex block breaks the run.
                inc = 0
            data = f.read(block_size)
            tmp = index
            # NOTE(review): index advances by 16 per 128-byte block, so the
            # printed value is 16 x the block number, not a byte offset.
            # Presumably the original stepped differently -- confirm.
            index = index + 16


def valid_ascii(char):
    """Return True if ``char`` is an accepted ASCII character, else None.

    The accepted set is string.printable without its last three characters
    (carriage return, vertical tab, form feed) plus carriage return: digits,
    letters, punctuation, space, tab, newline and CR.

    char: a single character, or a single byte as read from a binary file.
        The check is a substring test, so it is only meaningful for
        one-character input.
    Returns True or None (not False).
    """
    # A binary read on Python 3 yields ``bytes``; map the byte to the code
    # point of the same value so the text comparison applies.  On Python 2
    # ``bytes`` is ``str`` and this branch is skipped.
    if isinstance(char, bytes) and not isinstance(char, str):
        char = char.decode('latin-1')
    if char in string.printable[:-3] + '\x0d':
        return True
    # The original `return None` sat after `return True` inside the `if` and
    # could never run; the function only returned None by falling off the end.
    return None

def check_bytes(file_):
    """Report whether a file contains a large amount of non-ASCII data.

    Each byte is passed to valid_ascii(); bytes for which it returns None
    are counted.  As soon as the count exceeds 10000 a "Suspicious" line is
    printed and True is returned.

    file_: path of the file to scan.
    Returns True when the threshold is exceeded, otherwise False.

    NOTE(review): both read expressions were missing from the source (bare
    ``byte =``).  The ``byte != ''`` loop condition implies one byte was
    read at a time; that is reproduced here with chunked reads.
    """
    count = 0
    with open(file_, 'rb') as f:
        # Read in chunks instead of one read call per byte; b'' is the
        # end-of-file sentinel.
        for chunk in iter(lambda: f.read(65536), b''):
            for pos in range(len(chunk)):
                # Slice, don't index, so valid_ascii() receives a one-byte
                # string on both Python 2 and Python 3.
                if valid_ascii(chunk[pos:pos + 1]) is None:
                    count = count + 1
                    if count > 10000:
                        print("Suspicious: large amounts of non-ASCII chars %s" % file_)
                        return True
    return False

def main():
    """Scan one file given on the command line, or every file in the cwd.

    With exactly one argument that path is scanned.  Otherwise every regular
    file in the current working directory is scanned.  For each file the
    header is checked, then the non-ASCII count is taken, and the entropy
    scan runs only when the non-ASCII check did not flag the file.

    NOTE(review): the bodies of both ``if check_bytes(...)`` statements, the
    ``else:`` and the ``main()`` call were missing from the source.  The
    calls to check_header() and shell_ent() are reconstructions (both are
    defined in this file and otherwise unused) -- confirm their placement.
    """
    if len(sys.argv) == 2:
        targets = [sys.argv[1]]
    else:
        # os.listdir() also returns directories, which open() cannot read;
        # keep regular files only.
        targets = [name for name in os.listdir(os.getcwd())
                   if os.path.isfile(name)]
    for path in targets:
        check_header(path)
        if not check_bytes(path):
            shell_ent(path)


if __name__ == '__main__':
    main()


No comments:

Post a Comment