| | Author: Michael Slusarz | | Author: Aleksander Machniak | +-----------------------------------------------------------------------+ */

/**
 * MS-TNEF format decoder based on code by:
 *   Graham Norbury
 * Original design by:
 *   Thomas Boll , Mark Simpson
 *
 * The decoder consumes a binary TNEF stream (e.g. a "winmail.dat"
 * attachment) and extracts the message-level properties and the
 * embedded attachments. It also contains helpers for decoding
 * (LZFu-)compressed RTF bodies and for converting RTF to plain text.
 *
 * @package    Framework
 * @subpackage Storage
 */
class rcube_tnef_decoder
{
    const SIGNATURE      = 0x223e9f78;
    const LVL_MESSAGE    = 0x01;
    const LVL_ATTACHMENT = 0x02;

    const AFROM            = 0x08000;
    const ASUBJECT         = 0x18004;
    const AMESSAGEID       = 0x18009;
    const AFILENAME        = 0x18010;
    const APARENTID        = 0x1800a;
    const ACONVERSATIONID  = 0x1800b;
    const ABODY            = 0x2800c;
    const ADATESENT        = 0x38005;
    const ADATERECEIVED    = 0x38006;
    const ADATEMODIFIED    = 0x38020;
    const APRIORITY        = 0x4800d;
    const AOWNER           = 0x60000;
    const ASENTFOR         = 0x60001;
    const ASTATUS          = 0x68007;
    const ATTACHDATA       = 0x6800f;
    const ATTACHMETAFILE   = 0x68011;
    const ATTACHCREATEDATE = 0x38012;
    const ARENDDATA        = 0x69002;
    const AMAPIPROPS       = 0x69003;
    const ARECIPIENTTABLE  = 0x69004;
    const AMAPIATTRS       = 0x69005;
    const AOEMCODEPAGE     = 0x69007;
    const AORIGINALMCLASS  = 0x70006;
    const AMCLASS          = 0x78008;
    const AVERSION         = 0x89006;

    const MAPI_TYPE_UNSET     = 0x0000;
    const MAPI_NULL           = 0x0001;
    const MAPI_SHORT          = 0x0002;
    const MAPI_INT            = 0x0003;
    const MAPI_FLOAT          = 0x0004;
    const MAPI_DOUBLE         = 0x0005;
    const MAPI_CURRENCY       = 0x0006;
    const MAPI_APPTIME        = 0x0007;
    const MAPI_ERROR          = 0x000a;
    const MAPI_BOOLEAN        = 0x000b;
    const MAPI_OBJECT         = 0x000d;
    const MAPI_INT8BYTE       = 0x0014;
    const MAPI_STRING         = 0x001e;
    const MAPI_UNICODE_STRING = 0x001f;
    const MAPI_SYSTIME        = 0x0040;
    const MAPI_CLSID          = 0x0048;
    const MAPI_BINARY         = 0x0102;

    const MAPI_BODY                       = 0x1000;
    const MAPI_RTF_COMPRESSED             = 0x1009;
    const MAPI_BODY_HTML                  = 0x1013;
    const MAPI_NATIVE_BODY                = 0x1016;
    const MAPI_DISPLAY_NAME               = 0x3001;
    const MAPI_ADDRTYPE                   = 0x3002;
    const MAPI_EMAIL_ADDRESS              = 0x3003;
    const MAPI_COMMENT                    = 0x3004;
    const MAPI_DEPTH                      = 0x3005;
    const MAPI_PROVIDER_DISPLAY           = 0x3006;
    const MAPI_CREATION_TIME              = 0x3007;
    const MAPI_LAST_MODIFICATION_TIME     = 0x3008;
    const MAPI_RESOURCE_FLAGS             = 0x3009;
    const MAPI_PROVIDER_DLL_NAME          = 0x300A;
    const MAPI_SEARCH_KEY                 = 0x300B;
    const MAPI_ATTACHMENT_X400_PARAMETERS = 0x3700;
    const MAPI_ATTACH_DATA                = 0x3701;
    const MAPI_ATTACH_ENCODING            = 0x3702;
    const MAPI_ATTACH_EXTENSION           = 0x3703;
    const MAPI_ATTACH_FILENAME            = 0x3704;
    const MAPI_ATTACH_METHOD              = 0x3705;
    const MAPI_ATTACH_LONG_FILENAME       = 0x3707;
    const MAPI_ATTACH_PATHNAME            = 0x3708;
    const MAPI_ATTACH_RENDERING           = 0x3709;
    const MAPI_ATTACH_TAG                 = 0x370A;
    const MAPI_RENDERING_POSITION         = 0x370B;
    const MAPI_ATTACH_TRANSPORT_NAME      = 0x370C;
    const MAPI_ATTACH_LONG_PATHNAME       = 0x370D;
    const MAPI_ATTACH_MIME_TAG            = 0x370E;
    const MAPI_ATTACH_ADDITIONAL_INFO     = 0x370F;
    const MAPI_ATTACH_MIME_SEQUENCE       = 0x3710;
    const MAPI_ATTACH_CONTENT_ID          = 0x3712;
    const MAPI_ATTACH_CONTENT_LOCATION    = 0x3713;
    const MAPI_ATTACH_FLAGS               = 0x3714;

    const MAPI_NAMED_TYPE_ID     = 0x0000;
    const MAPI_NAMED_TYPE_STRING = 0x0001;
    const MAPI_NAMED_TYPE_NONE   = 0xff;
    const MAPI_MV_FLAG           = 0x1000;

    const RTF_UNCOMPRESSED = 0x414c454d;
    const RTF_COMPRESSED   = 0x75465a4c;

    /**
     * Windows codepage number announced by the stream (AOEMCODEPAGE).
     * Set while decoding and used by convertString().
     *
     * @var int|null
     */
    protected $codepage;


    /**
     * Decompress the data.
     *
     * @param string $data The data to decompress.
     *
     * @return array Hash with two keys: 'message' (message properties)
     *               and 'attachments' (list of attachment hashes). Both are
     *               empty arrays when the data has no TNEF signature.
     */
    public function decompress($data)
    {
        $data        = (string) $data;
        $attachments = array();
        $message     = array();

        if ($this->_geti($data, 32) === self::SIGNATURE) {
            $this->_geti($data, 16); // legacy key

            // The first attribute is read here and discarded
            $this->_geti($data, 8);  // lvl_message
            $this->_geti($data, 32); // idTnefVersion
            $this->_getx($data, $this->_geti($data, 32));
            $this->_geti($data, 16); // checksum

            // Every iteration consumes at least the one-byte level marker,
            // so the loop terminates even on garbage input.
            while (strlen($data) > 0) {
                switch ($this->_geti($data, 8)) {
                case self::LVL_MESSAGE:
                    $this->_decodeMessage($data, $message);
                    break;

                case self::LVL_ATTACHMENT:
                    $this->_decodeAttachment($data, $attachments);
                    break;
                }
            }
        }

        // Attachments are collected in reverse order (see _decodeAttachment())
        return array(
            'message'     => $message,
            'attachments' => array_reverse($attachments),
        );
    }

    /**
     * Pop specified number of bytes from the buffer.
     *
     * @param string  &$data The data string.
     * @param integer $bytes How many bytes to retrieve.
     *
     * @return string|null Extracted data, NULL if the buffer is shorter
     *                     than requested (the buffer is left untouched then)
     */
    protected function _getx(&$data, $bytes)
    {
        // A missing length (NULL from _geti()) means "nothing to read"
        $bytes = (int) $bytes;
        $value = null;

        if (strlen((string) $data) >= $bytes) {
            $value = substr($data, 0, $bytes);
            $data  = substr($data, $bytes);
        }

        return $value;
    }

    /**
     * Pop specified number of bits from the buffer
     * and interpret them as a little-endian unsigned integer.
     *
     * @param string  &$data The data string.
     * @param integer $bits  How many bits to retrieve (8, 16 or 32).
     *
     * @return int|null The integer, NULL if the buffer is too short
     */
    protected function _geti(&$data, $bits)
    {
        $bytes = (int) ($bits / 8);
        $value = null;

        if ($bytes > 0 && strlen((string) $data) >= $bytes) {
            $value = ord($data[0]);

            if ($bytes >= 2) {
                $value += (ord($data[1]) << 8);
            }

            if ($bytes >= 4) {
                $value += (ord($data[2]) << 16) + (ord($data[3]) << 24);
            }

            $data = substr($data, $bytes);
        }

        return $value;
    }

    /**
     * Decode a single attribute: a 32-bit length, the payload
     * and a 16-bit checksum (which is read but not verified).
     *
     * @param string &$data The data string.
     *
     * @return string|null Attribute payload
     */
    protected function _decodeAttribute(&$data)
    {
        // Data.
        $value = $this->_getx($data, $this->_geti($data, 32));

        // Checksum.
        $this->_geti($data, 16);

        return $value;
    }

    /**
     * Parse a block of MAPI properties and copy the interesting ones
     * (bodies, file name, MIME type, embedded message) into $result.
     *
     * @param string $data    The MAPI properties block.
     * @param array  &$result Message or attachment hash to be filled in
     */
    protected function _extractMapiAttributes($data, &$result)
    {
        $data = (string) $data;

        // Number of attributes.
        $number = $this->_geti($data, 32);

        while ((strlen($data) > 0) && $number--) {
            $have_mval = false;
            $num_mval  = 1;
            $value     = null;
            $attr_type = $this->_geti($data, 16);
            $attr_name = $this->_geti($data, 16);

            if (($attr_type & self::MAPI_MV_FLAG) != 0) {
                $have_mval = true;
                $attr_type = $attr_type & ~self::MAPI_MV_FLAG;
            }

            // Named property: skip the GUID and resolve the name
            if (($attr_name >= 0x8000) && ($attr_name < 0xFFFE)) {
                $this->_getx($data, 16);
                $named_type = $this->_geti($data, 32);

                switch ($named_type) {
                case self::MAPI_NAMED_TYPE_ID:
                    $attr_name = $this->_geti($data, 32);
                    break;

                case self::MAPI_NAMED_TYPE_STRING:
                    // String names are not used, skip it (with padding)
                    $attr_name = 0x9999;
                    $idlen     = (int) $this->_geti($data, 32);
                    $this->_getx($data, $idlen + ((4 - ($idlen % 4)) % 4));
                    break;

                case self::MAPI_NAMED_TYPE_NONE:
                default:
                    continue 2;
                }
            }

            if ($have_mval) {
                $num_mval = $this->_geti($data, 32);
            }

            switch ($attr_type) {
            case self::MAPI_NULL:
            case self::MAPI_TYPE_UNSET:
                break;

            case self::MAPI_SHORT:
                // 16-bit value followed by 16 bits of padding
                $value = $this->_geti($data, 16);
                $this->_geti($data, 16);
                break;

            case self::MAPI_INT:
            case self::MAPI_BOOLEAN:
                // The count comes from untrusted input, stop on end of data
                for ($i = 0; $i < $num_mval && strlen($data) > 0; $i++) {
                    $value = $this->_geti($data, 32);
                }
                break;

            case self::MAPI_FLOAT:
            case self::MAPI_ERROR:
                $value = $this->_getx($data, 4);
                break;

            case self::MAPI_DOUBLE:
            case self::MAPI_APPTIME:
            case self::MAPI_CURRENCY:
            case self::MAPI_INT8BYTE:
            case self::MAPI_SYSTIME:
                $value = $this->_getx($data, 8);
                break;

            case self::MAPI_STRING:
            case self::MAPI_UNICODE_STRING:
            case self::MAPI_BINARY:
            case self::MAPI_OBJECT:
                $num_vals = $have_mval ? $num_mval : $this->_geti($data, 32);

                // Only the last value is kept. The count comes from untrusted
                // input, so stop as soon as the data is exhausted.
                for ($i = 0; $i < $num_vals && strlen($data) > 0; $i++) {
                    $length = (int) $this->_geti($data, 32);

                    // Pad to next 4 byte boundary.
                    $datalen = $length + ((4 - ($length % 4)) % 4);

                    // Read and truncate to length (drop the padding bytes).
                    $value = $this->_getx($data, $datalen);

                    if ($value !== null) {
                        $value = substr($value, 0, $length);
                    }
                }

                if ($attr_type == self::MAPI_UNICODE_STRING) {
                    $value = $this->convertString($value);
                }

                break;
            }

            // Store any interesting attributes.
            switch ($attr_name) {
            case self::MAPI_RTF_COMPRESSED:
                $base = !empty($result['name']) ? $result['name'] : 'Untitled';

                $result['type']    = 'application';
                $result['subtype'] = 'rtf';
                $result['name']    = $base . '.rtf';
                $result['stream']  = $this->_decodeRTF($value);
                $result['size']    = strlen($result['stream']);
                break;

            case self::MAPI_BODY:
            case self::MAPI_BODY_HTML:
                $is_plain = $attr_name == self::MAPI_BODY;
                $base     = !empty($result['name']) ? $result['name'] : 'Untitled';

                $result['type']    = 'text';
                $result['subtype'] = $is_plain ? 'plain' : 'html';
                $result['name']    = $base . ($is_plain ? '.txt' : '.html');
                $result['stream']  = $value;
                $result['size']    = strlen((string) $value);
                break;

            case self::MAPI_ATTACH_LONG_FILENAME:
                // Used in preference to AFILENAME value. Strip the path.
                $result['name'] = trim(preg_replace('/.*[\/](.*)$/', '\1', (string) $value));
                break;

            case self::MAPI_ATTACH_MIME_TAG:
                // Is this ever set, and what is format?
                $parts = explode('/', (string) $value);

                $result['type']    = $parts[0];
                $result['subtype'] = isset($parts[1]) ? $parts[1] : null;
                break;

            case self::MAPI_ATTACH_DATA:
                // Embedded TNEF object: skip the 16-byte prefix and decode
                // the rest with a separate decoder instance
                $this->_getx($value, 16);

                $att    = new rcube_tnef_decoder();
                $res    = $att->decompress($value);
                $result = array_merge((array) $result, $res['message']);
                break;
            }
        }
    }

    /**
     * Decodes TNEF message attributes
     *
     * @param string &$data    The data string.
     * @param array  &$message Message data
     */
    protected function _decodeMessage(&$data, &$message)
    {
        $attribute = $this->_geti($data, 32);
        $value     = $this->_decodeAttribute($data);

        switch ($attribute) {
        case self::AOEMCODEPAGE:
            // Find codepage of the message
            if (strlen((string) $value) >= 4) {
                $unpacked       = unpack('V', $value);
                $this->codepage = $unpacked[1];
            }
            break;

        case self::AMCLASS:
            // Message class is not used. A normal message is
            // the one with 'IPM.Microsoft Mail.' prefix.
            break;

        case self::ASUBJECT:
            $message['name'] = $value;
            break;

        case self::AMAPIPROPS:
            $this->_extractMapiAttributes($value, $message);
            break;
        }
    }

    /**
     * Decodes TNEF attachment attributes
     *
     * @param string &$data       The data string.
     * @param array  &$attachment Attachments data. The attachment being
     *                            decoded is always the first element.
     */
    protected function _decodeAttachment(&$data, &$attachment)
    {
        $attribute = $this->_geti($data, 32);
        $size      = $this->_geti($data, 32);
        $value     = $this->_getx($data, $size);

        $this->_geti($data, 16); // checksum

        switch ($attribute) {
        case self::ARENDDATA:
            // Add a new default data block to hold details of this
            // attachment. Reverse order is easier to handle later!
            array_unshift($attachment, array(
                'type'    => 'application',
                'subtype' => 'octet-stream',
                'name'    => 'unknown',
                'stream'  => ''
            ));
            break;

        case self::AFILENAME:
            $value = $this->convertString($value, true);

            // Strip path
            $attachment[0]['name'] = trim(preg_replace('/.*[\/](.*)$/', '\1', $value));
            break;

        case self::ATTACHDATA:
            // The attachment itself
            $attachment[0]['size']   = $size;
            $attachment[0]['stream'] = $value;
            break;

        case self::AMAPIATTRS:
            $this->_extractMapiAttributes($value, $attachment[0]);
            break;
        }
    }

    /**
     * Convert string value to system charset according to defined codepage
     *
     * @param string $str          Input string
     * @param bool   $use_codepage Convert from the codepage announced by
     *                             the stream (if known). Otherwise UTF-16LE
     *                             is assumed when the string contains a NUL
     *                             byte other than a trailing one.
     *
     * @return string Converted and trimmed string
     */
    protected function convertString($str, $use_codepage = false)
    {
        $str     = (string) $str;
        $charset = null;

        if ($use_codepage && $this->codepage
            && isset(rcube_charset::$windows_codepages[$this->codepage])
        ) {
            $charset = rcube_charset::$windows_codepages[$this->codepage];
        }

        if ($charset) {
            $str = rcube_charset::convert($str, $charset, RCUBE_CHARSET);
        }
        else if (($pos = strpos($str, "\0")) !== false && $pos != strlen($str)-1) {
            $str = rcube_charset::convert($str, 'UTF-16LE', RCUBE_CHARSET);
        }

        return trim($str);
    }

    /**
     * Decodes TNEF RTF
     *
     * @param string $data RTF stream including the 16-byte header
     *                     (compressed size, raw size, type magic, CRC)
     *
     * @return string RTF content. Decompressed if the magic says so,
     *                otherwise everything that follows the header.
     */
    protected function _decodeRTF($data)
    {
        $data = (string) $data;

        $this->_geti($data, 32);          // compressed size (unused)
        $size  = $this->_geti($data, 32); // raw size
        $magic = $this->_geti($data, 32); // compression type
        $this->_geti($data, 32);          // CRC (not verified)

        if ($magic === self::RTF_COMPRESSED) {
            $data = $this->_decompressRTF($data, $size);
        }

        return $data;
    }

    /**
     * Decompress compressed RTF. Logic taken from Horde.
     *
     * The stream is a sequence of runs: a control byte followed by eight
     * tokens. Each bit of the control byte (LSB first) says whether the
     * token is a literal byte or a two-byte reference (12-bit offset,
     * 4-bit length) into a 4096-byte window pre-filled with common RTF.
     *
     * @param string $data Compressed data (without the header)
     * @param int    $size Expected size of decompressed data
     *
     * @return string Decompressed data
     */
    protected function _decompressRTF($data, $size)
    {
        // Pre-loaded dictionary. Note: "\r\n" (CR LF) after "\blue0".
        $preload = "{\\rtf1\\ansi\\mac\\deff0\\deftab720{\\fonttbl;}{\\f0\\fnil \\froman \\fswiss \\fmodern \\fscript \\fdecor MS Sans SerifSymbolArialTimes New RomanCourier{\\colortbl\\red0\\green0\\blue0\r\n\\par \\pard\\plain\\f0\\fs20\\b\\i\\u\\tab\\tx";

        $length_preload = strlen($preload);
        $data           = (string) $data;
        $data_len       = strlen($data);
        $limit          = (int) $size + $length_preload;

        // $out is the write position and always equals strlen($uncomp)
        $uncomp = $preload;
        $out    = $length_preload;
        $in     = $flags = $flag_count = 0;

        while ($out < $limit) {
            if (($flag_count++ % 8) == 0) {
                if ($in >= $data_len) {
                    break; // truncated input
                }

                $flags = ord($data[$in++]);
            }
            else {
                $flags = $flags >> 1;
            }

            if (($flags & 1) != 0) {
                // Dictionary reference needs two more bytes
                if ($in + 1 >= $data_len) {
                    break;
                }

                $offset = ord($data[$in++]);
                $length = ord($data[$in++]);
                $offset = ($offset << 4) | ($length >> 4);
                $length = ($length & 0xF) + 2;

                // Map the position in the circular window
                // to the position in the output string
                $offset = ((int)($out / 4096)) * 4096 + $offset;

                if ($offset >= $out) {
                    $offset -= 4096;
                }

                if ($offset < 0) {
                    // Reference to a place never written:
                    // end-of-stream marker or broken input
                    break;
                }

                // Byte-by-byte on purpose, source and target may overlap
                $end = $offset + $length;
                while ($offset < $end) {
                    $uncomp .= $uncomp[$offset++];
                    ++$out;
                }
            }
            else {
                if ($in >= $data_len) {
                    break; // truncated input
                }

                $uncomp .= $data[$in++];
                ++$out;
            }
        }

        return substr($uncomp, $length_preload);
    }

    /**
     * Parse RTF data and return the best plaintext representation we can.
     * Adapted from:
     * http://webcheatsheet.com/php/reading_the_clean_text_from_rtf.php
     *
     * @param string $text The RTF (uncompressed) text.
     *
     * @return string The plain text.
     */
    public static function rtf2text($text)
    {
        $text     = (string) $text;
        $len      = strlen($text);
        $document = '';
        $stack    = array(); // group states, $stack[$j] is the current one
        $j        = -1;

        // Read the data character-by-character
        for ($i = 0; $i < $len; $i++) {
            $c = $text[$i];

            switch ($c) {
            case "\\":
                // Key word (a backslash may be the very last character)
                $nextChar = ($i + 1 < $len) ? $text[$i + 1] : '';
                $isPlain  = self::_rtfIsPlain(self::_rtfGroup($stack, $j));

                // If it is another backslash or nonbreaking space or hyphen,
                // then the character is plain text and add it to the output stream.
                if ($nextChar === "\\" && $isPlain) {
                    $document .= "\\";
                }
                elseif ($nextChar === '~' && $isPlain) {
                    $document .= ' ';
                }
                elseif ($nextChar === '_' && $isPlain) {
                    $document .= '-';
                }
                elseif ($nextChar === '*') {
                    // Mark the group as an ignorable destination
                    $stack[$j]['*'] = true;
                }
                elseif ($nextChar === "'") {
                    // If it is a single quote, read next two characters that
                    // are the hexadecimal notation of a character we should add
                    // to the output stream.
                    $hex = substr($text, $i + 2, 2);

                    if ($isPlain) {
                        $document .= html_entity_decode('&#' . hexdec($hex) .';');
                    }

                    // Shift the pointer.
                    $i += 2;
                }
                elseif ($nextChar >= 'a' && $nextChar <= 'z' || $nextChar >= 'A' && $nextChar <= 'Z') {
                    // A letter starts a control word optionally
                    // followed by a (possibly negative) numeric parameter.
                    $word  = '';
                    $param = null;

                    // Start reading characters after the backslash.
                    for ($k = $i + 1, $m = 0; $k < $len; $k++, $m++) {
                        $nextChar = $text[$k];

                        if ($nextChar >= 'a' && $nextChar <= 'z' || $nextChar >= 'A' && $nextChar <= 'Z') {
                            // A letter after digits ends the control word
                            if (!empty($param)) {
                                break;
                            }

                            $word .= $nextChar;
                        }
                        elseif ($nextChar >= '0' && $nextChar <= '9') {
                            // If it is a digit, store the parameter.
                            $param .= $nextChar;
                        }
                        elseif ($nextChar === '-') {
                            // Minus sign may occur only before a digit
                            // parameter, otherwise it ends the control word.
                            if (!empty($param)) {
                                break;
                            }

                            $param .= $nextChar;
                        }
                        else {
                            break;
                        }
                    }

                    // Shift the pointer on the number of read characters.
                    $i += $m - 1;

                    // Start analyzing. We are interested mostly in control words
                    $toText = '';

                    switch (strtolower($word)) {
                    case 'u':
                        // The parameter is the decimal notation of the Unicode
                        // character. It is a signed 16-bit number, so values
                        // above 32767 are written as negative numbers.
                        $code = (int) $param;

                        if ($code < 0) {
                            $code += 65536;
                        }

                        $toText .= html_entity_decode('&#x' . dechex($code) .';');

                        // Skip the fallback characters announced with \ucN
                        $ucDelta = isset($stack[$j]['uc']) ? $stack[$j]['uc'] : 0;

                        if ($ucDelta > 0) {
                            $i += (int) $ucDelta;
                        }
                        break;

                    case 'par':
                    case 'page':
                    case 'column':
                    case 'line':
                    case 'lbr':
                        $toText .= "\n";
                        break;

                    case 'emspace':
                    case 'enspace':
                    case 'qmspace':
                        $toText .= ' ';
                        break;

                    case 'tab':
                        $toText .= "\t";
                        break;

                    case 'chdate':
                        $toText .= date('m.d.Y');
                        break;

                    case 'chdpl':
                        $toText .= date('l, j F Y');
                        break;

                    case 'chdpa':
                        $toText .= date('D, j M Y');
                        break;

                    case 'chtime':
                        $toText .= date('H:i:s');
                        break;

                    case 'emdash':
                        $toText .= html_entity_decode('—');
                        break;

                    case 'endash':
                        $toText .= html_entity_decode('–');
                        break;

                    case 'bullet':
                        $toText .= html_entity_decode('•');
                        break;

                    case 'lquote':
                        $toText .= html_entity_decode('‘');
                        break;

                    case 'rquote':
                        $toText .= html_entity_decode('’');
                        break;

                    case 'ldblquote':
                        $toText .= html_entity_decode('«');
                        break;

                    case 'rdblquote':
                        $toText .= html_entity_decode('»');
                        break;

                    default:
                        // Remember any other control word in the group state
                        $stack[$j][strtolower($word)] = empty($param) ? true : $param;
                        break;
                    }

                    // Add data to the output stream if required.
                    if (self::_rtfIsPlain(self::_rtfGroup($stack, $j))) {
                        $document .= $toText;
                    }
                }

                $i++;
                break;

            case '{':
                // New subgroup starts, add new stack element and write the data
                // from previous stack element to it.
                if (!empty($stack[$j])) {
                    array_push($stack, $stack[$j++]);
                }
                else {
                    $j++;
                }
                break;

            case '}':
                array_pop($stack);
                $j--;
                break;

            case "\0":
            case "\r":
            case "\f":
            case "\n":
                // Junk
                break;

            default:
                // Add other data to the output stream if required.
                if (!empty($stack[$j]) && self::_rtfIsPlain($stack[$j])) {
                    $document .= $c;
                }
                break;
            }
        }

        return $document;
    }

    /**
     * Returns state of the RTF group at the specified stack position.
     *
     * @param array $stack Stack of group states
     * @param int   $pos   Position in the stack
     *
     * @return array|null Group state, NULL if there is none
     */
    protected static function _rtfGroup($stack, $pos)
    {
        return isset($stack[$pos]) ? $stack[$pos] : null;
    }

    /**
     * Checks if content of an RTF group is the document text,
     * i.e. the group is not one of the known non-text destinations.
     *
     * @param array|null $s Group state
     *
     * @return bool True if the group content should go to the output
     */
    protected static function _rtfIsPlain($s)
    {
        $notPlain = array('*', 'fonttbl', 'colortbl', 'datastore', 'themedata', 'stylesheet');

        foreach ($notPlain as $key) {
            if (!empty($s[$key])) {
                return false;
            }
        }

        return true;
    }
}