/**
* MailArchiver is an application that provides services for storing and managing e-mail messages through a Web Services SOAP interface.
* Copyright (C) 2012 Marcio Andre Scholl Levien and Fernando Alberto Reuter Wendt and Jose Ronaldo Nogueira Fonseca Junior
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/******************************************************************************\
*
* This product was developed by
*
* SERVIÇO FEDERAL DE PROCESSAMENTO DE DADOS (SERPRO),
*
* a government company established under Brazilian law (5.615/70),
* at Department of Development of Porto Alegre.
*
\******************************************************************************/
package serpro.mailarchiver.domain.metaarchive;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.jdo.JDOHelper;
import javax.jdo.annotations.NotPersistent;
import javax.jdo.annotations.PersistenceCapable;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.Tag;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.mutable.MutableBoolean;
import org.apache.james.mime4j.util.CharsetUtil;
import serpro.mailarchiver.util.BodyVisitor;
import serpro.mailarchiver.util.Charsets;
import serpro.mailarchiver.util.Logger;
@PersistenceCapable
public class TextBody
extends SingleBody
{
@NotPersistent
private static final Logger log = Logger.getLocalLogger();
@NotPersistent private static final String CR = "\015";
@NotPersistent private static final String LF = "\012";
@NotPersistent private static final String CRLF = "\015\012";
//**** P E R S I S T E N T ****
private String preview;
//*****************************
public final String getPreview() {
return preview;
}
public final void setPreview(String preview) {
this.preview = preview;
}
//--------------------------------------------------------------------------
@Override
final String toString(String pad) {
return String.format(
"TextBody%n"
+ "%1$sjdoState: %2$s%n"
+ "%1$soid: %3$s%n"
+ "%1$shash: %4$x%n"
+ "%1$soffset: %5$d%n"
+ "%1$slength: %6$d%n"
+ "%1$ssize: %7$d%n"
+ "%1$spreview: %8$s"
, pad
, JDOHelper.getObjectState(this)
, getOid()
, hashCode()
, getOffset()
, getLength()
, getSize()
, getPreview());
}
//--------------------------------------------------------------------------
public String getText() throws IOException {
Entity entity = getEntity();
if(entity != null) {
ContentTypeField contentTypeField = entity.getContentTypeField();
Charset cs = null;
if(contentTypeField != null) {
cs = CharsetUtil.lookup(contentTypeField.getCharset());
}
if(cs == null) {
cs = Charsets.Windows_1252;
}
InputStream is = getDecoderInputStream();
InputStreamReader isr = new InputStreamReader(is, cs);
BufferedReader reader = new BufferedReader(isr);
StringBuilder sb = new StringBuilder();
char[] cbuf = new char[0x1000];
int len;
while((len = reader.read(cbuf)) > 0) {
sb.append(cbuf, 0, len);
}
reader.close();
isr.close();
is.close();
return sb.toString();
}
throw new IllegalStateException();
}
public String getAdaptedText() throws IOException {
String text = getText();
Entity entity = getEntity();
if(entity != null) {
ContentTypeField contentTypeField = entity.getContentTypeField();
if((contentTypeField == null) || (contentTypeField.isTextPlainMimeType())) {
text = text.replaceAll(CRLF + "|" + CR + "|" + LF, "
" + LF);
}
}
text = StringEscapeUtils.unescapeHtml4(text);
text = replaceMailToRef(text);
text = replaceContentIdRef(text);
text = replaceUrlRef(text);
return text;
}
//--------------------------------------------------------------------------
@NotPersistent
private static final String contentIdRegex = "src\\s*=(\\.*(?!cid))(" +
"'\\s*cid:\\s*([^'\\s<>]*)\\s*'" +
"|" +
"\"\\s*cid:\\s*([^\"\\s<>]*)\\s*\"" +
"|" +
"cid:\\s*([^\\s<>]+)" +
")";
@NotPersistent
private static final Pattern contentIdPattern = Pattern.compile(contentIdRegex, Pattern.CASE_INSENSITIVE);
private String replaceContentIdRef(String text) {
Matcher m = contentIdPattern.matcher(text);
final StringBuilder sb = new StringBuilder();
int lastEnd = 0;
while(m.find()) {
sb.append(text.substring(lastEnd, m.start()))
.append("src=\"");
final String cid =
m.group(4) != null ? m.group(4) :
m.group(5) != null ? m.group(5) :
m.group(6) != null ? m.group(6) :
null;
final MutableBoolean binaryBodyFound = new MutableBoolean(false);
getRootMessage().visitBodies(new BodyVisitor() {
@Override
public void visitBinaryBody(BinaryBody binaryBody) {
Entity entity = binaryBody.getEntity();
UnstructuredField contentIdField = entity.getContentIdField();
if(contentIdField != null) {
if(contentIdField.getText().equalsIgnoreCase(cid)) {
sb.append("\" name=\"embedded_img_").append(binaryBody.getOid()).append("/").append(binaryBody.getFileName()).append("\" ");
binaryBodyFound.setValue(true);
quit();
}
}
}
});
if(binaryBodyFound.isFalse()) {
sb.append("cid:").append(cid).append("\" ");
}
lastEnd = m.end();
}
sb.append(text.substring(lastEnd));
return sb.toString();
}
//--------------------------------------------------------------------------
@NotPersistent
private static final String mailToRegex = "href\\s*=\\s*(" +
"'\\s*mailto:\\s*([^'\\s<>]*)\\s*'" +
"|" +
"\"\\s*mailto:\\s*([^\"\\s<>]*)\\s*\"" +
"|" +
"mailto:\\s*([^\\s<>]+)" +
")";
@NotPersistent
private static final Pattern mailToPattern = Pattern.compile(mailToRegex, Pattern.CASE_INSENSITIVE);
private String replaceMailToRef(String text) {
Matcher m = mailToPattern.matcher(text);
StringBuilder sb = new StringBuilder();
int lastEnd = 0;
while(m.find()) {
sb.append(text.substring(lastEnd, m.start()))
.append("name=\"_mailto\" href=\"javascript:new_message_to('");
if(m.group(2) != null) {
sb.append(m.group(2));
}
else if(m.group(3) != null) {
sb.append(m.group(3));
}
else if(m.group(4) != null) {
sb.append(m.group(4));
}
sb.append("');\"");
lastEnd = m.end();
}
sb.append(text.substring(lastEnd));
return sb.toString();
}
//--------------------------------------------------------------------------
@NotPersistent
private static final String urlRegex = "\\b(" +
"(?:(https?://)|www\\d{0,3}[.]|[a-z0-9.\\-]+[.](?=(?:com|org|net|gov|mil|info)(?:[.]br)?)|[a-z0-9.\\-]+[.][a-z]{2,4}/)" +
"(?:[^\\s()<>]+|\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\))+" +
"(?:\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:'\".,<>\\?«»“”‘’]))";
@NotPersistent
private static final Pattern urlPattern = Pattern.compile(urlRegex, Pattern.CASE_INSENSITIVE);
private String replaceUrlRef(String text) {
Source source = new Source(text);
source.fullSequentialParse();
Matcher m = urlPattern.matcher(text);
StringBuilder sb = new StringBuilder();
int lastEnd = 0;
find:
while(m.find()) {
Element element = source.getEnclosingElement(m.start());
while(element != null) {
String elementName = element.getName();
if(elementName == HTMLElementName.A
|| elementName == HTMLElementName.LINK
|| elementName == HTMLElementName.AREA
|| elementName == HTMLElementName.BASE
|| elementName == HTMLElementName.META
|| elementName == HTMLElementName.SCRIPT
|| elementName == HTMLElementName.IMG
|| elementName == HTMLElementName.VIDEO
|| elementName == HTMLElementName.AUDIO
|| elementName == HTMLElementName.SOURCE
|| elementName == HTMLElementName.BLOCKQUOTE
|| elementName == HTMLElementName.DEL
|| elementName == HTMLElementName.INS
|| elementName == HTMLElementName.Q
|| elementName == HTMLElementName.BUTTON
|| elementName == HTMLElementName.INPUT
|| elementName == HTMLElementName.OBJECT
|| elementName == HTMLElementName.EMBED
|| elementName == HTMLElementName.COMMAND)
{
continue find;
}
Tag startTag = element.getStartTag();
if((startTag.getBegin() < m.start()) && (m.start() < startTag.getEnd())) {
continue find;
}
element = element.getParentElement();
}
sb.append(text.substring(lastEnd, m.start()))
.append("").append(m.group(1)).append("");
lastEnd = m.end();
}
sb.append(text.substring(lastEnd));
return sb.toString();
}
}