# Copyright (C) 2002-2007 Python Software Foundation
# Contact: email-sig@python.org

"""Email address parsing code.

Lifted directly from rfc822.py.  This should eventually be rewritten.
"""

__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
    ]

import calendar
import time

SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '

# Parse a date field
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']

_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a 10-element time tuple.

    The first nine elements have the layout of a time tuple (year, month,
    day, hour, minute, second, weekday, yearday, dst); weekday, yearday and
    dst are always the placeholders 0, 1 and -1.  The tenth element is the
    timezone offset from UTC in seconds.  An unknown or explicitly
    unspecified (-0000) timezone is reported as 0.

    Returns None when `data' cannot be parsed.
    """
    res = _parsedate_tz(data)
    if not res:
        return None
    if res[9] is None:
        res[9] = 0
    return tuple(res)


def _parsedate_tz(data):
    """Convert date to extended time tuple.

    The last (additional) element is the time zone offset in seconds, except if
    the timezone was specified as -0000.  In that case the last element is
    None.  This indicates a UTC timestamp that explicitly declaims knowledge of
    the source timezone, as opposed to a +0000 timestamp that indicates the
    source timezone really was UTC.

    Returns a 10-element list, or None if `data' is empty, blank, or not
    recognizable as a date.
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input: nothing to index into below.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3:  # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The timezone may be glued to the time without whitespace.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('')  # Dummy tz
    if len(data) < 5:
        return None
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    if not (dd and mm and yy):
        # Splitting an RFC 850 date on '-' can yield empty components;
        # they would otherwise be indexed below.
        return None
    mm = mm.lower()
    if mm not in _monthnames:
        # Accept "month day" order as well as "day month".
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # Full month names occupy the second half of the table.
        mm -= 12
    if dd[-1] == ',':
        dd = dd[:-1]
    i = yy.find(':')
    if i > 0:
        # The year and time fields are swapped.
        yy, tm = tm, yy
    if yy[-1] == ',':
        yy = yy[:-1]
        if not yy:
            return None
    if not yy[0].isdigit():
        # The year and timezone fields are swapped.
        yy, tz = tz, yy
    if tm[-1] == ',':
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    elif len(tm) == 1 and '.' in tm[0]:
        # Some non-compliant MUAs use '.' to separate time elements.
        tm = tm[0].split('.')
        if len(tm) == 2:
            [thh, tmm] = tm
            tss = '0'
        elif len(tm) == 3:
            [thh, tmm, tss] = tm
        else:
            return None
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    # Check for a yy specified in two-digit format, then convert it to the
    # appropriate four-digit format, according to the POSIX standard. RFC 822
    # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    # mandates a 4-digit yy. For more information, see the documentation for
    # the time module.
    if yy < 100:
        # The year is between 1969 and 1999 (inclusive).
        if yy > 68:
            yy += 1900
        # The year is between 2000 and 2068 (inclusive).
        else:
            yy += 2000
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
        if tzoffset == 0 and tz.startswith('-'):
            tzoffset = None
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ((tzoffset//100)*3600 + (tzoffset % 100)*60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset]


def parsedate(data):
    """Convert a time string to a time tuple.

    Returns the first nine elements of the parsedate_tz() result, or None
    if `data' cannot be parsed.
    """
    t = parsedate_tz(data)
    if isinstance(t, tuple):
        return t[:9]
    else:
        return t


def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp."""
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        # (the concatenation below requires `data' to be a tuple).
        return time.mktime(data[:8] + (-1,))
    else:
        t = calendar.timegm(data)
        return t - data[9]


def quote(str):
    """Prepare string to be used in a quoted string.

    Turns backslash and double quote characters into quoted pairs.  These
    are the only characters that need to be quoted inside a quoted string.
    Does not add the surrounding double quotes.
    """
    # Backslashes must be doubled first so the ones added for '"' survive.
    return str.replace('\\', '\\\\').replace('"', '\\"')


class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    Note: this class interface is deprecated and may be removed in the future.
    Use email.utils.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        # Index of the next unparsed character in self.field.
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        # Comments collected while parsing the current address.
        self.commentlist = []

    def gotonext(self):
        """Skip white space and extract comments.

        Returns the skipped spaces and tabs as a string (CR and LF are
        skipped but not returned).  Comments are appended to
        self.commentlist.
        """
        wslist = []
        skippable = self.LWS + '\n\r'
        while self.pos < len(self.field):
            if self.field[self.pos] in skippable:
                if self.field[self.pos] not in '\n\r':
                    wslist.append(self.field[self.pos])
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break
        return EMPTYSTRING.join(wslist)

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses.  Each entry is a
        (name, address) pair; a position where getaddress() finds nothing
        contributes ('', '').
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (name, address) pairs: empty when nothing could
        be parsed, and possibly several entries for a group.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        oldcl = self.commentlist
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(SPACE.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(SPACE.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(SPACE.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Step over a stray special so parsing can make progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the parser is not positioned at a `<'.
        """
        if self.field[self.pos] != '<':
            return None

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # Discard the route domain that follows an `@'.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the address as a string.  Returns only the local part when
        no `@' follows it, and the empty string when the domain is invalid.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            preserve_ws = True
            if self.field[self.pos] == '.':
                # Drop whitespace recorded just before the dot.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                aslist.append('.')
                self.pos += 1
                preserve_ws = False
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % quote(self.getquote()))
            elif self.field[self.pos] in self.atomends:
                # Drop trailing whitespace before the terminator.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                break
            else:
                aslist.append(self.getatom())
            ws = self.gotonext()
            if preserve_ws and ws:
                aslist.append(ws)

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return EMPTYSTRING.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain, return an empty address instead of returning a
            # local part to denote failed parsing.
            return EMPTYSTRING
        return EMPTYSTRING.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Returns the empty string if a second `@' is encountered.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] == '@':
                # bpo-34155: Don't parse domains with two `@` like
                # `a@malicious.org@important.com`.
                return EMPTYSTRING
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return EMPTYSTRING.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        Returns the fragment without its delimiters; a backslash is dropped
        and the character after it is kept literally.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        # True when the previous character was a backslash.
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments met along the way are appended
        to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.FWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist


class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses."""

    def __init__(self, field):
        """Parse `field' into self.addresslist (empty for a falsy field)."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        return len(self.addresslist)

    def __add__(self, other):
        # Set union
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            if x not in self.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __iadd__(self, other):
        # Set union, in-place
        for x in other.addresslist:
            if x not in self.addresslist:
                self.addresslist.append(x)
        return self

    def __sub__(self, other):
        # Set difference
        newaddr = AddressList(None)
        for x in self.addresslist:
            if x not in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __isub__(self, other):
        # Set difference, in-place
        for x in other.addresslist:
            if x in self.addresslist:
                self.addresslist.remove(x)
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]