Got from https://sourceforge.net/tracker/index.php?func=detail&aid=891491&group_id=103&atid=300103 [ 891491 ] Scrubber.py patch (file 77903: scrubber.patch 2004-02-25) You can apply it to an installed Mailman system, but in that case you should apply the Defaults.py.in part of the patch to Defaults.py instead. Index: mailman/Mailman/Defaults.py.in =================================================================== RCS file: /cvsroot/mailman/mailman/Mailman/Defaults.py.in,v retrieving revision 2.112.2.14 diff -u -r2.112.2.14 Defaults.py.in --- mailman/Mailman/Defaults.py.in 22 Feb 2004 22:20:51 -0000 2.112.2.14 +++ mailman/Mailman/Defaults.py.in 25 Feb 2004 08:14:37 -0000 @@ -256,6 +256,11 @@ # should modify the Message object as necessary. ARCHIVE_SCRUBBER = 'Mailman.Handlers.Scrubber' +# Mailman.Handlers.Scrubber uses the attachment's filename as is. +# If you don't like this (e.g. extremely long MIME-encoded filenames), then set +# this to True. +SCRUBBER_DONT_USE_ATTACHMENT_FILENAME = False + # This variable defines what happens to text/html subparts. They can be # stripped completely, escaped, or filtered through an external program. The # legal values are: Index: mailman/Mailman/Handlers/Scrubber.py =================================================================== RCS file: /cvsroot/mailman/mailman/Mailman/Handlers/Scrubber.py,v retrieving revision 2.18.2.6 diff -u -r2.18.2.6 Scrubber.py --- mailman/Mailman/Handlers/Scrubber.py 1 Dec 2003 01:43:18 -0000 2.18.2.6 +++ mailman/Mailman/Handlers/Scrubber.py 25 Feb 2004 08:14:37 -0000 @@ -27,7 +27,7 @@ import binascii import tempfile from cStringIO import StringIO -from types import IntType +from types import IntType, StringType from email.Utils import parsedate from email.Parser import HeaderParser @@ -180,11 +180,29 @@ # message. 
if charset is None: charset = part.get_content_charset(lcset) + # TK: if part is attached then check charset and scrub if none + if part.get('content-disposition') and \ + not part.get_content_charset(): + omask = os.umask(002) + try: + url = save_attachment(mlist, part, dir) + finally: + os.umask(omask) + filename = part.get_filename(_('not available')) + filename = Utils.oneline(filename, lcset) + del part['content-type'] + del part['content-transfer-encoding'] + part.set_payload(_("""\ +An embedded and charset-unspecified text was scrubbed... +Name: %(filename)s +Url: %(url)s +"""), lcset) elif ctype == 'text/html' and isinstance(sanitize, IntType): if sanitize == 0: if outer: raise DiscardMessage del part['content-type'] + del part['content-transfer-encoding'] part.set_payload(_('HTML attachment scrubbed and removed'), # Adding charset arg and removing content-tpe # sets content-type to text/plain @@ -202,6 +220,7 @@ finally: os.umask(omask) del part['content-type'] + del part['content-transfer-encoding'] part.set_payload(_("""\ An HTML attachment was scrubbed... URL: %(url)s @@ -267,6 +286,7 @@ os.umask(omask) desc = part.get('content-description', _('not available')) filename = part.get_filename(_('not available')) + filename = Utils.oneline(filename, lcset) del part['content-type'] del part['content-transfer-encoding'] part.set_payload(_("""\ @@ -285,7 +305,7 @@ # By default we take the charset of the first text/plain part in the # message, but if there was none, we'll use the list's preferred # language's charset. - if charset is None or charset == 'us-ascii': + if not charset or charset == 'us-ascii': charset = lcset # We now want to concatenate all the parts which have been scrubbed to # text/plain, into a single text/plain payload. We need to make sure @@ -294,17 +314,27 @@ # BAW: Martin's original patch suggested we might want to try # generalizing to utf-8, and that's probably a good idea (eventually). 
text = [] - for part in msg.get_payload(): + for part in msg.walk(): + if part.get_content_maintype() == 'multipart': + continue # All parts should be scrubbed to text/plain by now. partctype = part.get_content_type() if partctype <> 'text/plain': - text.append(_('Skipped content of type %(partctype)s')) + text.append(_('Skipped content of type %(partctype)s\n')) continue try: t = part.get_payload(decode=True) except binascii.Error: t = part.get_payload() - partcharset = part.get_content_charset() + # TK: get_content_charset() returns 'iso-2022-jp' for internally + # crafted (scrubbed) 'euc-jp' text part. So, first try + # get_charset(), then get_content_charset() for the parts + # which are already embedded in the incoming message. + partcharset = part.get_charset() + if partcharset: + partcharset = str(partcharset) + else: + partcharset = part.get_content_charset() if partcharset and partcharset <> charset: try: t = unicode(t, partcharset, 'replace') @@ -320,9 +350,10 @@ except (UnicodeError, LookupError, ValueError): t = t.encode(lcset, 'replace') # Separation is useful - if not t.endswith('\n'): - t += '\n' - text.append(t) + if isinstance(t, StringType): + if not t.endswith('\n'): + t += '\n' + text.append(t) # Now join the text and set the payload sep = _('-------------- next part --------------\n') del msg['content-type'] @@ -376,7 +407,7 @@ # Now base the filename on what's in the attachment, uniquifying it if # necessary. filename = msg.get_filename() - if not filename: + if not filename or mm_cfg.SCRUBBER_DONT_USE_ATTACHMENT_FILENAME: filebase = 'attachment' else: # Sanitize the filename given in the message headers