/**
 * MailArchiver is an application that provides services for storing and managing e-mail messages through a Web Services SOAP interface.
 * Copyright (C) 2012  Marcio Andre Scholl Levien and Fernando Alberto Reuter Wendt and Jose Ronaldo Nogueira Fonseca Junior
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/******************************************************************************\
*
*  This product was developed by
*
*        SERVIÇO FEDERAL DE PROCESSAMENTO DE DADOS (SERPRO),
*
*  a government company established under Brazilian law (5.615/70),
*  at Department of Development of Porto Alegre.
*
\******************************************************************************/

package serpro.mailarchiver.domain.metaarchive;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.jdo.JDOHelper;
import javax.jdo.annotations.NotPersistent;
import javax.jdo.annotations.PersistenceCapable;

import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.Tag;

import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.mutable.MutableBoolean;
import org.apache.james.mime4j.util.CharsetUtil;

import serpro.mailarchiver.util.BodyVisitor;
import serpro.mailarchiver.util.Charsets;
import serpro.mailarchiver.util.Logger;

/**
 * A textual body of a message entity.
 *
 * <p>Besides the persistent {@code preview} string, this class can decode the
 * body into a {@link String} ({@link #getText()}) and produce an "adapted"
 * variant ({@link #getAdaptedText()}) in which {@code mailto:} links,
 * {@code cid:} image references and bare URLs are rewritten.
 */
@PersistenceCapable
public class TextBody extends SingleBody {

    @NotPersistent
    private static final Logger log = Logger.getLocalLogger();

    @NotPersistent
    private static final String CR = "\015";

    @NotPersistent
    private static final String LF = "\012";

    @NotPersistent
    private static final String CRLF = "\015\012";

    /** Matches one line terminator; CRLF is listed first so it is consumed as a single terminator. */
    @NotPersistent
    private static final Pattern lineBreakPattern = Pattern.compile(CRLF + "|" + CR + "|" + LF);

    // NOTE(review): the literal at this position was split by a raw line break in
    // the checked-in source (an unterminated string literal). An HTML line break
    // is the evident intent; confirm the exact spelling (<br>, <br/> or <br />)
    // against the consumer of getAdaptedText().
    @NotPersistent
    private static final String HTML_LINE_BREAK = "<br/>";

    //**** P E R S I S T E N T ****
    private String preview;
    //*****************************

    /** @return the stored preview string, possibly {@code null} */
    public final String getPreview() {
        return preview;
    }

    /** @param preview the preview string to store */
    public final void setPreview(String preview) {
        this.preview = preview;
    }

    //--------------------------------------------------------------------------
    /**
     * Renders a multi-line description of this body.
     *
     * @param pad prefix placed in front of every attribute line
     */
    @Override
    final String toString(String pad) {
        return String.format(
                "TextBody%n"
                + "%1$sjdoState: %2$s%n"
                + "%1$soid: %3$s%n"
                + "%1$shash: %4$x%n"
                + "%1$soffset: %5$d%n"
                + "%1$slength: %6$d%n"
                + "%1$ssize: %7$d%n"
                + "%1$spreview: %8$s"
                , pad
                , JDOHelper.getObjectState(this)
                , getOid()
                , hashCode()
                , getOffset()
                , getLength()
                , getSize()
                , getPreview());
    }

    //--------------------------------------------------------------------------
    /**
     * Decodes the whole body into a string.
     *
     * <p>The charset is looked up from the entity's Content-Type field; when the
     * field is absent or the lookup yields nothing, Windows-1252 is used.
     *
     * @return the decoded body text
     * @throws IOException if reading the decoder stream fails
     * @throws IllegalStateException if this body has no entity
     */
    public String getText() throws IOException {
        Entity entity = getEntity();
        if(entity == null) {
            throw new IllegalStateException("TextBody has no entity");
        }

        Charset cs = null;
        ContentTypeField contentTypeField = entity.getContentTypeField();
        if(contentTypeField != null) {
            cs = CharsetUtil.lookup(contentTypeField.getCharset());
        }
        if(cs == null) {
            cs = Charsets.Windows_1252;
        }

        InputStream is = getDecoderInputStream();
        BufferedReader reader = new BufferedReader(new InputStreamReader(is, cs));
        try {
            StringBuilder sb = new StringBuilder();
            char[] cbuf = new char[0x1000];
            int len;
            while((len = reader.read(cbuf)) != -1) {
                sb.append(cbuf, 0, len);
            }
            return sb.toString();
        }
        finally {
            // Closing the outermost reader closes the wrapped streams as well,
            // and now also happens when read() throws.
            reader.close();
        }
    }

    /**
     * Returns the body text adapted for display.
     *
     * <p>Steps, in order: for bodies without a Content-Type field or with a
     * text/plain one, every line terminator is replaced by an HTML line break
     * followed by LF; HTML entities are unescaped; {@code mailto:} links,
     * {@code cid:} references and bare URLs are rewritten.
     *
     * @return the adapted text
     * @throws IOException if the body cannot be read
     * @throws IllegalStateException if this body has no entity
     */
    public String getAdaptedText() throws IOException {
        String text = getText();

        Entity entity = getEntity();
        if(entity != null) {
            ContentTypeField contentTypeField = entity.getContentTypeField();
            if((contentTypeField == null) || (contentTypeField.isTextPlainMimeType())) {
                text = lineBreakPattern.matcher(text).replaceAll(HTML_LINE_BREAK + LF);
            }
        }

        text = StringEscapeUtils.unescapeHtml4(text);
        text = replaceMailToRef(text);
        text = replaceContentIdRef(text);
        text = replaceUrlRef(text);
        return text;
    }

    //--------------------------------------------------------------------------
    // Capturing groups: 1 = dots after '=', 2 = whole reference,
    // 3 = single-quoted cid, 4 = double-quoted cid, 5 = unquoted cid.
    // NOTE(review): the (?!cid) lookahead sits directly in front of the
    // alternation, so the unquoted alternative (group 5) looks unreachable.
    @NotPersistent
    private static final String contentIdRegex =
            "src\\s*=(\\.*(?!cid))("
            + "'\\s*cid:\\s*([^'\\s<>]*)\\s*'"
            + "|"
            + "\"\\s*cid:\\s*([^\"\\s<>]*)\\s*\""
            + "|"
            + "cid:\\s*([^\\s<>]+)"
            + ")";

    @NotPersistent
    private static final Pattern contentIdPattern = Pattern.compile(contentIdRegex, Pattern.CASE_INSENSITIVE);

    /**
     * Rewrites {@code src=...cid:...} references.
     *
     * <p>For every match the binary bodies of the root message are visited; when
     * one has a Content-ID field whose text equals the referenced id (ignoring
     * case), the reference becomes an empty {@code src} plus a
     * {@code name="embedded_img_<oid>/<fileName>"} attribute. Otherwise the
     * reference is re-emitted as a double-quoted {@code src="cid:<id>"}.
     */
    private String replaceContentIdRef(String text) {
        Matcher m = contentIdPattern.matcher(text);
        final StringBuilder sb = new StringBuilder();
        int lastEnd = 0;
        while(m.find()) {
            sb.append(text, lastEnd, m.start())
              .append("src=\"");

            // The pattern has five groups; the id is in whichever of 3..5 took part
            // in the match. (Reading group 6 raised IndexOutOfBoundsException for
            // single-quoted references.)
            String matchedId = m.group(3);
            if(matchedId == null) {
                matchedId = m.group(4);
            }
            if(matchedId == null) {
                matchedId = m.group(5);
            }
            final String cid = matchedId;

            final MutableBoolean binaryBodyFound = new MutableBoolean(false);

            getRootMessage().visitBodies(new BodyVisitor() {
                @Override
                public void visitBinaryBody(BinaryBody binaryBody) {
                    Entity entity = binaryBody.getEntity();
                    if(entity == null) {
                        return;
                    }
                    UnstructuredField contentIdField = entity.getContentIdField();
                    if((contentIdField != null) && contentIdField.getText().equalsIgnoreCase(cid)) {
                        sb.append("\" name=\"embedded_img_")
                          .append(binaryBody.getOid())
                          .append("/")
                          .append(binaryBody.getFileName())
                          .append("\" ");
                        binaryBodyFound.setValue(true);
                        quit();
                    }
                }
            });

            if(binaryBodyFound.isFalse()) {
                sb.append("cid:").append(cid).append("\" ");
            }

            lastEnd = m.end();
        }
        sb.append(text, lastEnd, text.length());
        return sb.toString();
    }

    //--------------------------------------------------------------------------
    // Capturing groups: 1 = whole reference, 2 = single-quoted address,
    // 3 = double-quoted address, 4 = unquoted address.
    @NotPersistent
    private static final String mailToRegex =
            "href\\s*=\\s*("
            + "'\\s*mailto:\\s*([^'\\s<>]*)\\s*'"
            + "|"
            + "\"\\s*mailto:\\s*([^\"\\s<>]*)\\s*\""
            + "|"
            + "mailto:\\s*([^\\s<>]+)"
            + ")";

    @NotPersistent
    private static final Pattern mailToPattern = Pattern.compile(mailToRegex, Pattern.CASE_INSENSITIVE);

    /**
     * Rewrites {@code href=mailto:<address>} (quoted or not) into
     * {@code name="_mailto" href="javascript:new_message_to('<address>');"}.
     */
    private String replaceMailToRef(String text) {
        Matcher m = mailToPattern.matcher(text);
        StringBuilder sb = new StringBuilder();
        int lastEnd = 0;
        while(m.find()) {
            sb.append(text, lastEnd, m.start())
              .append("name=\"_mailto\" href=\"javascript:new_message_to('");

            // NOTE(review): the address is inserted verbatim; the double-quoted and
            // unquoted alternatives admit a single quote, which would end the
            // script string early. Consider escaping - confirm with the UI owners.
            for(int group = 2; group <= 4; group++) {
                String address = m.group(group);
                if(address != null) {
                    sb.append(address);
                    break;
                }
            }

            sb.append("');\"");
            lastEnd = m.end();
        }
        sb.append(text, lastEnd, text.length());
        return sb.toString();
    }

    //--------------------------------------------------------------------------
    // Group 1 is the whole URL; group 2 is the "http://" / "https://" scheme when present.
    @NotPersistent
    private static final String urlRegex =
            "\\b("
            + "(?:(https?://)|www\\d{0,3}[.]|[a-z0-9.\\-]+[.](?=(?:com|org|net|gov|mil|info)(?:[.]br)?)|[a-z0-9.\\-]+[.][a-z]{2,4}/)"
            + "(?:[^\\s()<>]+|\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\))+"
            + "(?:\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:'\".,<>\\?«»“”‘’]))";

    @NotPersistent
    private static final Pattern urlPattern = Pattern.compile(urlRegex, Pattern.CASE_INSENSITIVE);

    /** Elements inside which a URL is left untouched. */
    @NotPersistent
    private static final Set<String> nonLinkableElements = Collections.unmodifiableSet(new HashSet<String>(Arrays.asList(
            HTMLElementName.A,
            HTMLElementName.LINK,
            HTMLElementName.AREA,
            HTMLElementName.BASE,
            HTMLElementName.META,
            HTMLElementName.SCRIPT,
            HTMLElementName.IMG,
            HTMLElementName.VIDEO,
            HTMLElementName.AUDIO,
            HTMLElementName.SOURCE,
            HTMLElementName.BLOCKQUOTE,
            HTMLElementName.DEL,
            HTMLElementName.INS,
            HTMLElementName.Q,
            HTMLElementName.BUTTON,
            HTMLElementName.INPUT,
            HTMLElementName.OBJECT,
            HTMLElementName.EMBED,
            HTMLElementName.COMMAND)));

    /**
     * Wraps bare URLs in anchor elements.
     *
     * <p>A URL is skipped when it lies inside one of the
     * {@link #nonLinkableElements} or inside the start tag of any enclosing
     * element (i.e. it is part of an attribute).
     */
    private String replaceUrlRef(String text) {
        Source source = new Source(text);
        source.fullSequentialParse();

        Matcher m = urlPattern.matcher(text);
        StringBuilder sb = new StringBuilder();
        int lastEnd = 0;
        while(m.find()) {
            if(!isLinkablePosition(source, m.start())) {
                continue;
            }

            String url = m.group(1);
            // NOTE(review): the checked-in source appended two empty strings around
            // the URL, leaving this method without effect; the anchor markup below is
            // a reconstruction - confirm the attributes the UI expects.
            String href = (m.group(2) != null) ? url : "http://" + url;

            sb.append(text, lastEnd, m.start())
              .append("<a href=\"").append(StringEscapeUtils.escapeHtml4(href)).append("\">")
              .append(url)
              .append("</a>");

            lastEnd = m.end();
        }
        sb.append(text, lastEnd, text.length());
        return sb.toString();
    }

    /**
     * Tells whether a URL starting at {@code pos} may be turned into a link:
     * no enclosing element is in {@link #nonLinkableElements} and {@code pos}
     * is not inside the start tag of any enclosing element.
     */
    private static boolean isLinkablePosition(Source source, int pos) {
        for(Element element = source.getEnclosingElement(pos); element != null; element = element.getParentElement()) {
            // equals-based lookup instead of comparing String references with ==
            if(nonLinkableElements.contains(element.getName())) {
                return false;
            }
            Tag startTag = element.getStartTag();
            if((startTag.getBegin() < pos) && (pos < startTag.getEnd())) {
                return false;
            }
        }
        return true;
    }
}