1r"""HTTP cookie handling for web clients. 2 3This module has (now fairly distant) origins in Gisle Aas' Perl module 4HTTP::Cookies, from the libwww-perl library. 5 6Docstrings, comments and debug strings in this code refer to the 7attributes of the HTTP cookie system as cookie-attributes, to distinguish 8them clearly from Python attributes. 9 10Class diagram (note that BSDDBCookieJar and the MSIE* classes are not 11distributed with the Python standard library, but are available from 12http://wwwsearch.sf.net/): 13 14 CookieJar____ 15 / \ \ 16 FileCookieJar \ \ 17 / | \ \ \ 18 MozillaCookieJar | LWPCookieJar \ \ 19 | | \ 20 | ---MSIEBase | \ 21 | / | | \ 22 | / MSIEDBCookieJar BSDDBCookieJar 23 |/ 24 MSIECookieJar 25 26""" 27 28__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy', 29 'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar'] 30 31import os 32import copy 33import datetime 34import re 35import time 36import urllib.parse, urllib.request 37import threading as _threading 38import http.client # only for the default HTTP port 39from calendar import timegm 40 41debug = False # set to True to enable debugging via the logging module 42logger = None 43 44def _debug(*args): 45 if not debug: 46 return 47 global logger 48 if not logger: 49 import logging 50 logger = logging.getLogger("http.cookiejar") 51 return logger.debug(*args) 52 53HTTPONLY_ATTR = "HTTPOnly" 54HTTPONLY_PREFIX = "#HttpOnly_" 55DEFAULT_HTTP_PORT = str(http.client.HTTP_PORT) 56NETSCAPE_MAGIC_RGX = re.compile("#( Netscape)? HTTP Cookie File") 57MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar " 58 "instance initialised with one)") 59NETSCAPE_HEADER_TEXT = """\ 60# Netscape HTTP Cookie File 61# http://curl.haxx.se/rfc/cookie_spec.html 62# This is a generated file! Do not edit. 63 64""" 65 66def _warn_unhandled_exception(): 67 # There are a few catch-all except: statements in this module, for 68 # catching input that's bad in unexpected ways. Warn if any 69 # exceptions are caught there. 70 import io, warnings, traceback 71 f = io.StringIO() 72 traceback.print_exc(None, f) 73 msg = f.getvalue() 74 warnings.warn("http.cookiejar bug!\n%s" % msg, stacklevel=2) 75 76 77# Date/time conversion 78# ----------------------------------------------------------------------------- 79 80EPOCH_YEAR = 1970 81def _timegm(tt): 82 year, month, mday, hour, min, sec = tt[:6] 83 if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and 84 (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)): 85 return timegm(tt) 86 else: 87 return None 88 89DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] 90MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", 91 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"] 92MONTHS_LOWER = [month.lower() for month in MONTHS] 93 94def time2isoz(t=None): 95 """Return a string representing time in seconds since epoch, t. 96 97 If the function is called without an argument, it will use the current 98 time. 99 100 The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ", 101 representing Universal Time (UTC, aka GMT). An example of this format is: 102 103 1994-11-24 08:49:37Z 104 105 """ 106 if t is None: 107 dt = datetime.datetime.utcnow() 108 else: 109 dt = datetime.datetime.utcfromtimestamp(t) 110 return "%04d-%02d-%02d %02d:%02d:%02dZ" % ( 111 dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second) 112 113def time2netscape(t=None): 114 """Return a string representing time in seconds since epoch, t. 115 116 If the function is called without an argument, it will use the current 117 time. 118 119 The format of the returned string is like this: 120 121 Wed, DD-Mon-YYYY HH:MM:SS GMT 122 123 """ 124 if t is None: 125 dt = datetime.datetime.utcnow() 126 else: 127 dt = datetime.datetime.utcfromtimestamp(t) 128 return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % ( 129 DAYS[dt.weekday()], dt.day, MONTHS[dt.month-1], 130 dt.year, dt.hour, dt.minute, dt.second) 131 132 133UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None} 134 135TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII) 136def offset_from_tz_string(tz): 137 offset = None 138 if tz in UTC_ZONES: 139 offset = 0 140 else: 141 m = TIMEZONE_RE.search(tz) 142 if m: 143 offset = 3600 * int(m.group(2)) 144 if m.group(3): 145 offset = offset + 60 * int(m.group(3)) 146 if m.group(1) == '-': 147 offset = -offset 148 return offset 149 150def _str2time(day, mon, yr, hr, min, sec, tz): 151 yr = int(yr) 152 if yr > datetime.MAXYEAR: 153 return None 154 155 # translate month name to number 156 # month numbers start with 1 (January) 157 try: 158 mon = MONTHS_LOWER.index(mon.lower())+1 159 except ValueError: 160 # maybe it's already a number 161 try: 162 imon = int(mon) 163 except ValueError: 164 return None 165 if 1 <= imon <= 12: 166 mon = imon 167 else: 168 return None 169 170 # make sure clock elements are defined 171 if hr is None: hr = 0 172 if min is None: min = 0 173 if sec is None: sec = 0 174 175 day = int(day) 176 hr = int(hr) 177 min = int(min) 178 sec = int(sec) 179 180 if yr < 1000: 181 # find "obvious" year 182 cur_yr = time.localtime(time.time())[0] 183 m = cur_yr % 100 184 tmp = yr 185 yr = yr + cur_yr - m 186 m = m - tmp 187 if abs(m) > 50: 188 if m > 0: yr = yr + 100 189 else: yr = yr - 100 190 191 # convert UTC time tuple to seconds since epoch (not timezone-adjusted) 192 t = _timegm((yr, mon, day, hr, min, sec, tz)) 193 194 if t is not None: 195 # adjust time using timezone string, to get absolute time since epoch 196 if tz is None: 197 tz = "UTC" 198 tz = tz.upper() 199 offset = offset_from_tz_string(tz) 200 if offset is None: 201 return None 202 t = t - offset 203 204 return t 205 206STRICT_DATE_RE = re.compile( 207 r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) " 208 r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII) 209WEEKDAY_RE = re.compile( 210 r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII) 211LOOSE_HTTP_DATE_RE = re.compile( 212 r"""^ 213 (\d\d?) # day 214 (?:\s+|[-\/]) 215 (\w+) # month 216 (?:\s+|[-\/]) 217 (\d+) # year 218 (?: 219 (?:\s+|:) # separator before clock 220 (\d\d?):(\d\d) # hour:min 221 (?::(\d\d))? # optional seconds 222 )? # optional clock 223 \s* 224 (?: 225 ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+) # timezone 226 \s* 227 )? 228 (?: 229 \(\w+\) # ASCII representation of timezone in parens. 230 \s* 231 )?$""", re.X | re.ASCII) 232def http2time(text): 233 """Returns time in seconds since epoch of time represented by a string. 234 235 Return value is an integer. 236 237 None is returned if the format of str is unrecognized, the time is outside 238 the representable range, or the timezone string is not recognized. If the 239 string contains no timezone, UTC is assumed. 240 241 The timezone in the string may be numerical (like "-0800" or "+0100") or a 242 string timezone (like "UTC", "GMT", "BST" or "EST"). Currently, only the 243 timezone strings equivalent to UTC (zero offset) are known to the function. 244 245 The function loosely parses the following formats: 246 247 Wed, 09 Feb 1994 22:23:32 GMT -- HTTP format 248 Tuesday, 08-Feb-94 14:15:29 GMT -- old rfc850 HTTP format 249 Tuesday, 08-Feb-1994 14:15:29 GMT -- broken rfc850 HTTP format 250 09 Feb 1994 22:23:32 GMT -- HTTP format (no weekday) 251 08-Feb-94 14:15:29 GMT -- rfc850 format (no weekday) 252 08-Feb-1994 14:15:29 GMT -- broken rfc850 format (no weekday) 253 254 The parser ignores leading and trailing whitespace. The time may be 255 absent. 256 257 If the year is given with only 2 digits, the function will select the 258 century that makes the year closest to the current date. 259 260 """ 261 # fast exit for strictly conforming string 262 m = STRICT_DATE_RE.search(text) 263 if m: 264 g = m.groups() 265 mon = MONTHS_LOWER.index(g[1].lower()) + 1 266 tt = (int(g[2]), mon, int(g[0]), 267 int(g[3]), int(g[4]), float(g[5])) 268 return _timegm(tt) 269 270 # No, we need some messy parsing... 271 272 # clean up 273 text = text.lstrip() 274 text = WEEKDAY_RE.sub("", text, 1) # Useless weekday 275 276 # tz is time zone specifier string 277 day, mon, yr, hr, min, sec, tz = [None]*7 278 279 # loose regexp parse 280 m = LOOSE_HTTP_DATE_RE.search(text) 281 if m is not None: 282 day, mon, yr, hr, min, sec, tz = m.groups() 283 else: 284 return None # bad format 285 286 return _str2time(day, mon, yr, hr, min, sec, tz) 287 288ISO_DATE_RE = re.compile( 289 r"""^ 290 (\d{4}) # year 291 [-\/]? 292 (\d\d?) # numerical month 293 [-\/]? 294 (\d\d?) # day 295 (?: 296 (?:\s+|[-:Tt]) # separator before clock 297 (\d\d?):?(\d\d) # hour:min 298 (?::?(\d\d(?:\.\d*)?))? # optional seconds (and fractional) 299 )? # optional clock 300 \s* 301 (?: 302 ([-+]?\d\d?:?(:?\d\d)? 303 |Z|z) # timezone (Z is "zero meridian", i.e. GMT) 304 \s* 305 )?$""", re.X | re. ASCII) 306def iso2time(text): 307 """ 308 As for http2time, but parses the ISO 8601 formats: 309 310 1994-02-03 14:15:29 -0100 -- ISO 8601 format 311 1994-02-03 14:15:29 -- zone is optional 312 1994-02-03 -- only date 313 1994-02-03T14:15:29 -- Use T as separator 314 19940203T141529Z -- ISO 8601 compact format 315 19940203 -- only date 316 317 """ 318 # clean up 319 text = text.lstrip() 320 321 # tz is time zone specifier string 322 day, mon, yr, hr, min, sec, tz = [None]*7 323 324 # loose regexp parse 325 m = ISO_DATE_RE.search(text) 326 if m is not None: 327 # XXX there's an extra bit of the timezone I'm ignoring here: is 328 # this the right thing to do? 329 yr, mon, day, hr, min, sec, tz, _ = m.groups() 330 else: 331 return None # bad format 332 333 return _str2time(day, mon, yr, hr, min, sec, tz) 334 335 336# Header parsing 337# ----------------------------------------------------------------------------- 338 339def unmatched(match): 340 """Return unmatched part of re.Match object.""" 341 start, end = match.span(0) 342 return match.string[:start]+match.string[end:] 343 344HEADER_TOKEN_RE = re.compile(r"^\s*([^=\s;,]+)") 345HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"") 346HEADER_VALUE_RE = re.compile(r"^\s*=\s*([^\s;,]*)") 347HEADER_ESCAPE_RE = re.compile(r"\\(.)") 348def split_header_words(header_values): 349 r"""Parse header values into a list of lists containing key,value pairs. 350 351 The function knows how to deal with ",", ";" and "=" as well as quoted 352 values after "=". A list of space separated tokens are parsed as if they 353 were separated by ";". 354 355 If the header_values passed as argument contains multiple values, then they 356 are treated as if they were a single value separated by comma ",". 357 358 This means that this function is useful for parsing header fields that 359 follow this syntax (BNF as from the HTTP/1.1 specification, but we relax 360 the requirement for tokens). 361 362 headers = #header 363 header = (token | parameter) *( [";"] (token | parameter)) 364 365 token = 1*<any CHAR except CTLs or separators> 366 separators = "(" | ")" | "<" | ">" | "@" 367 | "," | ";" | ":" | "\" | <"> 368 | "/" | "[" | "]" | "?" | "=" 369 | "{" | "}" | SP | HT 370 371 quoted-string = ( <"> *(qdtext | quoted-pair ) <"> ) 372 qdtext = <any TEXT except <">> 373 quoted-pair = "\" CHAR 374 375 parameter = attribute "=" value 376 attribute = token 377 value = token | quoted-string 378 379 Each header is represented by a list of key/value pairs. The value for a 380 simple token (not part of a parameter) is None. Syntactically incorrect 381 headers will not necessarily be parsed as you would want. 382 383 This is easier to describe with some examples: 384 385 >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz']) 386 [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]] 387 >>> split_header_words(['text/html; charset="iso-8859-1"']) 388 [[('text/html', None), ('charset', 'iso-8859-1')]] 389 >>> split_header_words([r'Basic realm="\"foo\bar\""']) 390 [[('Basic', None), ('realm', '"foobar"')]] 391 392 """ 393 assert not isinstance(header_values, str) 394 result = [] 395 for text in header_values: 396 orig_text = text 397 pairs = [] 398 while text: 399 m = HEADER_TOKEN_RE.search(text) 400 if m: 401 text = unmatched(m) 402 name = m.group(1) 403 m = HEADER_QUOTED_VALUE_RE.search(text) 404 if m: # quoted value 405 text = unmatched(m) 406 value = m.group(1) 407 value = HEADER_ESCAPE_RE.sub(r"\1", value) 408 else: 409 m = HEADER_VALUE_RE.search(text) 410 if m: # unquoted value 411 text = unmatched(m) 412 value = m.group(1) 413 value = value.rstrip() 414 else: 415 # no value, a lone token 416 value = None 417 pairs.append((name, value)) 418 elif text.lstrip().startswith(","): 419 # concatenated headers, as per RFC 2616 section 4.2 420 text = text.lstrip()[1:] 421 if pairs: result.append(pairs) 422 pairs = [] 423 else: 424 # skip junk 425 non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text) 426 assert nr_junk_chars > 0, ( 427 "split_header_words bug: '%s', '%s', %s" % 428 (orig_text, text, pairs)) 429 text = non_junk 430 if pairs: result.append(pairs) 431 return result 432 433HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])") 434def join_header_words(lists): 435 """Do the inverse (almost) of the conversion done by split_header_words. 436 437 Takes a list of lists of (key, value) pairs and produces a single header 438 value. Attribute values are quoted if needed. 439 440 >>> join_header_words([[("text/plain", None), ("charset", "iso-8859-1")]]) 441 'text/plain; charset="iso-8859-1"' 442 >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859-1")]]) 443 'text/plain, charset="iso-8859-1"' 444 445 """ 446 headers = [] 447 for pairs in lists: 448 attr = [] 449 for k, v in pairs: 450 if v is not None: 451 if not re.search(r"^\w+$", v): 452 v = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", v) # escape " and \ 453 v = '"%s"' % v 454 k = "%s=%s" % (k, v) 455 attr.append(k) 456 if attr: headers.append("; ".join(attr)) 457 return ", ".join(headers) 458 459def strip_quotes(text): 460 if text.startswith('"'): 461 text = text[1:] 462 if text.endswith('"'): 463 text = text[:-1] 464 return text 465 466def parse_ns_headers(ns_headers): 467 """Ad-hoc parser for Netscape protocol cookie-attributes. 468 469 The old Netscape cookie format for Set-Cookie can for instance contain 470 an unquoted "," in the expires field, so we have to use this ad-hoc 471 parser instead of split_header_words. 472 473 XXX This may not make the best possible effort to parse all the crap 474 that Netscape Cookie headers contain. Ronald Tschalar's HTTPClient 475 parser is probably better, so could do worse than following that if 476 this ever gives any trouble. 477 478 Currently, this is also used for parsing RFC 2109 cookies. 479 480 """ 481 known_attrs = ("expires", "domain", "path", "secure", 482 # RFC 2109 attrs (may turn up in Netscape cookies, too) 483 "version", "port", "max-age") 484 485 result = [] 486 for ns_header in ns_headers: 487 pairs = [] 488 version_set = False 489 490 # XXX: The following does not strictly adhere to RFCs in that empty 491 # names and values are legal (the former will only appear once and will 492 # be overwritten if multiple occurrences are present). This is 493 # mostly to deal with backwards compatibility. 494 for ii, param in enumerate(ns_header.split(';')): 495 param = param.strip() 496 497 key, sep, val = param.partition('=') 498 key = key.strip() 499 500 if not key: 501 if ii == 0: 502 break 503 else: 504 continue 505 506 # allow for a distinction between present and empty and missing 507 # altogether 508 val = val.strip() if sep else None 509 510 if ii != 0: 511 lc = key.lower() 512 if lc in known_attrs: 513 key = lc 514 515 if key == "version": 516 # This is an RFC 2109 cookie. 517 if val is not None: 518 val = strip_quotes(val) 519 version_set = True 520 elif key == "expires": 521 # convert expires date to seconds since epoch 522 if val is not None: 523 val = http2time(strip_quotes(val)) # None if invalid 524 pairs.append((key, val)) 525 526 if pairs: 527 if not version_set: 528 pairs.append(("version", "0")) 529 result.append(pairs) 530 531 return result 532 533 534IPV4_RE = re.compile(r"\.\d+$", re.ASCII) 535def is_HDN(text): 536 """Return True if text is a host domain name.""" 537 # XXX 538 # This may well be wrong. Which RFC is HDN defined in, if any (for 539 # the purposes of RFC 2965)? 540 # For the current implementation, what about IPv6? Remember to look 541 # at other uses of IPV4_RE also, if change this. 542 if IPV4_RE.search(text): 543 return False 544 if text == "": 545 return False 546 if text[0] == "." or text[-1] == ".": 547 return False 548 return True 549 550def domain_match(A, B): 551 """Return True if domain A domain-matches domain B, according to RFC 2965. 552 553 A and B may be host domain names or IP addresses. 554 555 RFC 2965, section 1: 556 557 Host names can be specified either as an IP address or a HDN string. 558 Sometimes we compare one host name with another. (Such comparisons SHALL 559 be case-insensitive.) Host A's name domain-matches host B's if 560 561 * their host name strings string-compare equal; or 562 563 * A is a HDN string and has the form NB, where N is a non-empty 564 name string, B has the form .B', and B' is a HDN string. (So, 565 x.y.com domain-matches .Y.com but not Y.com.) 566 567 Note that domain-match is not a commutative operation: a.b.c.com 568 domain-matches .c.com, but not the reverse. 569 570 """ 571 # Note that, if A or B are IP addresses, the only relevant part of the 572 # definition of the domain-match algorithm is the direct string-compare. 573 A = A.lower() 574 B = B.lower() 575 if A == B: 576 return True 577 if not is_HDN(A): 578 return False 579 i = A.rfind(B) 580 if i == -1 or i == 0: 581 # A does not have form NB, or N is the empty string 582 return False 583 if not B.startswith("."): 584 return False 585 if not is_HDN(B[1:]): 586 return False 587 return True 588 589def liberal_is_HDN(text): 590 """Return True if text is a sort-of-like a host domain name. 591 592 For accepting/blocking domains. 593 594 """ 595 if IPV4_RE.search(text): 596 return False 597 return True 598 599def user_domain_match(A, B): 600 """For blocking/accepting domains. 601 602 A and B may be host domain names or IP addresses. 603 604 """ 605 A = A.lower() 606 B = B.lower() 607 if not (liberal_is_HDN(A) and liberal_is_HDN(B)): 608 if A == B: 609 # equal IP addresses 610 return True 611 return False 612 initial_dot = B.startswith(".") 613 if initial_dot and A.endswith(B): 614 return True 615 if not initial_dot and A == B: 616 return True 617 return False 618 619cut_port_re = re.compile(r":\d+$", re.ASCII) 620def request_host(request): 621 """Return request-host, as defined by RFC 2965. 622 623 Variation from RFC: returned value is lowercased, for convenient 624 comparison. 625 626 """ 627 url = request.get_full_url() 628 host = urllib.parse.urlparse(url)[1] 629 if host == "": 630 host = request.get_header("Host", "") 631 632 # remove port, if present 633 host = cut_port_re.sub("", host, 1) 634 return host.lower() 635 636def eff_request_host(request): 637 """Return a tuple (request-host, effective request-host name). 638 639 As defined by RFC 2965, except both are lowercased. 640 641 """ 642 erhn = req_host = request_host(request) 643 if req_host.find(".") == -1 and not IPV4_RE.search(req_host): 644 erhn = req_host + ".local" 645 return req_host, erhn 646 647def request_path(request): 648 """Path component of request-URI, as defined by RFC 2965.""" 649 url = request.get_full_url() 650 parts = urllib.parse.urlsplit(url) 651 path = escape_path(parts.path) 652 if not path.startswith("/"): 653 # fix bad RFC 2396 absoluteURI 654 path = "/" + path 655 return path 656 657def request_port(request): 658 host = request.host 659 i = host.find(':') 660 if i >= 0: 661 port = host[i+1:] 662 try: 663 int(port) 664 except ValueError: 665 _debug("nonnumeric port: '%s'", port) 666 return None 667 else: 668 port = DEFAULT_HTTP_PORT 669 return port 670 671# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't 672# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738). 673HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()" 674ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])") 675def uppercase_escaped_char(match): 676 return "%%%s" % match.group(1).upper() 677def escape_path(path): 678 """Escape any invalid characters in HTTP URL, and uppercase all escapes.""" 679 # There's no knowing what character encoding was used to create URLs 680 # containing %-escapes, but since we have to pick one to escape invalid 681 # path characters, we pick UTF-8, as recommended in the HTML 4.0 682 # specification: 683 # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1 684 # And here, kind of: draft-fielding-uri-rfc2396bis-03 685 # (And in draft IRI specification: draft-duerst-iri-05) 686 # (And here, for new URI schemes: RFC 2718) 687 path = urllib.parse.quote(path, HTTP_PATH_SAFE) 688 path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path) 689 return path 690 691def reach(h): 692 """Return reach of host h, as defined by RFC 2965, section 1. 693 694 The reach R of a host name H is defined as follows: 695 696 * If 697 698 - H is the host domain name of a host; and, 699 700 - H has the form A.B; and 701 702 - A has no embedded (that is, interior) dots; and 703 704 - B has at least one embedded dot, or B is the string "local". 705 then the reach of H is .B. 706 707 * Otherwise, the reach of H is H. 708 709 >>> reach("www.acme.com") 710 '.acme.com' 711 >>> reach("acme.com") 712 'acme.com' 713 >>> reach("acme.local") 714 '.local' 715 716 """ 717 i = h.find(".") 718 if i >= 0: 719 #a = h[:i] # this line is only here to show what a is 720 b = h[i+1:] 721 i = b.find(".") 722 if is_HDN(h) and (i >= 0 or b == "local"): 723 return "."+b 724 return h 725 726def is_third_party(request): 727 """ 728 729 RFC 2965, section 3.3.6: 730 731 An unverifiable transaction is to a third-party host if its request- 732 host U does not domain-match the reach R of the request-host O in the 733 origin transaction. 734 735 """ 736 req_host = request_host(request) 737 if not domain_match(req_host, reach(request.origin_req_host)): 738 return True 739 else: 740 return False 741 742 743class Cookie: 744 """HTTP Cookie. 745 746 This class represents both Netscape and RFC 2965 cookies. 747 748 This is deliberately a very simple class. It just holds attributes. It's 749 possible to construct Cookie instances that don't comply with the cookie 750 standards. CookieJar.make_cookies is the factory function for Cookie 751 objects -- it deals with cookie parsing, supplying defaults, and 752 normalising to the representation used in this class. CookiePolicy is 753 responsible for checking them to see whether they should be accepted from 754 and returned to the server. 755 756 Note that the port may be present in the headers, but unspecified ("Port" 757 rather than"Port=80", for example); if this is the case, port is None. 758 759 """ 760 761 def __init__(self, version, name, value, 762 port, port_specified, 763 domain, domain_specified, domain_initial_dot, 764 path, path_specified, 765 secure, 766 expires, 767 discard, 768 comment, 769 comment_url, 770 rest, 771 rfc2109=False, 772 ): 773 774 if version is not None: version = int(version) 775 if expires is not None: expires = int(float(expires)) 776 if port is None and port_specified is True: 777 raise ValueError("if port is None, port_specified must be false") 778 779 self.version = version 780 self.name = name 781 self.value = value 782 self.port = port 783 self.port_specified = port_specified 784 # normalise case, as per RFC 2965 section 3.3.3 785 self.domain = domain.lower() 786 self.domain_specified = domain_specified 787 # Sigh. We need to know whether the domain given in the 788 # cookie-attribute had an initial dot, in order to follow RFC 2965 789 # (as clarified in draft errata). Needed for the returned $Domain 790 # value. 791 self.domain_initial_dot = domain_initial_dot 792 self.path = path 793 self.path_specified = path_specified 794 self.secure = secure 795 self.expires = expires 796 self.discard = discard 797 self.comment = comment 798 self.comment_url = comment_url 799 self.rfc2109 = rfc2109 800 801 self._rest = copy.copy(rest) 802 803 def has_nonstandard_attr(self, name): 804 return name in self._rest 805 def get_nonstandard_attr(self, name, default=None): 806 return self._rest.get(name, default) 807 def set_nonstandard_attr(self, name, value): 808 self._rest[name] = value 809 810 def is_expired(self, now=None): 811 if now is None: now = time.time() 812 if (self.expires is not None) and (self.expires <= now): 813 return True 814 return False 815 816 def __str__(self): 817 if self.port is None: p = "" 818 else: p = ":"+self.port 819 limit = self.domain + p + self.path 820 if self.value is not None: 821 namevalue = "%s=%s" % (self.name, self.value) 822 else: 823 namevalue = self.name 824 return "<Cookie %s for %s>" % (namevalue, limit) 825 826 def __repr__(self): 827 args = [] 828 for name in ("version", "name", "value", 829 "port", "port_specified", 830 "domain", "domain_specified", "domain_initial_dot", 831 "path", "path_specified", 832 "secure", "expires", "discard", "comment", "comment_url", 833 ): 834 attr = getattr(self, name) 835 args.append("%s=%s" % (name, repr(attr))) 836 args.append("rest=%s" % repr(self._rest)) 837 args.append("rfc2109=%s" % repr(self.rfc2109)) 838 return "%s(%s)" % (self.__class__.__name__, ", ".join(args)) 839 840 841class CookiePolicy: 842 """Defines which cookies get accepted from and returned to server. 843 844 May also modify cookies, though this is probably a bad idea. 845 846 The subclass DefaultCookiePolicy defines the standard rules for Netscape 847 and RFC 2965 cookies -- override that if you want a customized policy. 848 849 """ 850 def set_ok(self, cookie, request): 851 """Return true if (and only if) cookie should be accepted from server. 852 853 Currently, pre-expired cookies never get this far -- the CookieJar 854 class deletes such cookies itself. 855 856 """ 857 raise NotImplementedError() 858 859 def return_ok(self, cookie, request): 860 """Return true if (and only if) cookie should be returned to server.""" 861 raise NotImplementedError() 862 863 def domain_return_ok(self, domain, request): 864 """Return false if cookies should not be returned, given cookie domain. 865 """ 866 return True 867 868 def path_return_ok(self, path, request): 869 """Return false if cookies should not be returned, given cookie path. 870 """ 871 return True 872 873 874class DefaultCookiePolicy(CookiePolicy): 875 """Implements the standard rules for accepting and returning cookies.""" 876 877 DomainStrictNoDots = 1 878 DomainStrictNonDomain = 2 879 DomainRFC2965Match = 4 880 881 DomainLiberal = 0 882 DomainStrict = DomainStrictNoDots|DomainStrictNonDomain 883 884 def __init__(self, 885 blocked_domains=None, allowed_domains=None, 886 netscape=True, rfc2965=False, 887 rfc2109_as_netscape=None, 888 hide_cookie2=False, 889 strict_domain=False, 890 strict_rfc2965_unverifiable=True, 891 strict_ns_unverifiable=False, 892 strict_ns_domain=DomainLiberal, 893 strict_ns_set_initial_dollar=False, 894 strict_ns_set_path=False, 895 secure_protocols=("https", "wss") 896 ): 897 """Constructor arguments should be passed as keyword arguments only.""" 898 self.netscape = netscape 899 self.rfc2965 = rfc2965 900 self.rfc2109_as_netscape = rfc2109_as_netscape 901 self.hide_cookie2 = hide_cookie2 902 self.strict_domain = strict_domain 903 self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable 904 self.strict_ns_unverifiable = strict_ns_unverifiable 905 self.strict_ns_domain = strict_ns_domain 906 self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar 907 self.strict_ns_set_path = strict_ns_set_path 908 self.secure_protocols = secure_protocols 909 910 if blocked_domains is not None: 911 self._blocked_domains = tuple(blocked_domains) 912 else: 913 self._blocked_domains = () 914 915 if allowed_domains is not None: 916 allowed_domains = tuple(allowed_domains) 917 self._allowed_domains = allowed_domains 918 919 def blocked_domains(self): 920 """Return the sequence of blocked domains (as a tuple).""" 921 return self._blocked_domains 922 def set_blocked_domains(self, blocked_domains): 923 """Set the sequence of blocked domains.""" 924 self._blocked_domains = tuple(blocked_domains) 925 926 def is_blocked(self, domain): 927 for blocked_domain in self._blocked_domains: 928 if user_domain_match(domain, blocked_domain): 929 return True 930 return False 931 932 def allowed_domains(self): 933 """Return None, or the sequence of allowed domains (as a tuple).""" 934 return self._allowed_domains 935 def set_allowed_domains(self, allowed_domains): 936 """Set the sequence of allowed domains, or None.""" 937 if allowed_domains is not None: 938 allowed_domains = tuple(allowed_domains) 939 self._allowed_domains = allowed_domains 940 941 def is_not_allowed(self, domain): 942 if self._allowed_domains is None: 943 return False 944 for allowed_domain in self._allowed_domains: 945 if user_domain_match(domain, allowed_domain): 946 return False 947 return True 948 949 def set_ok(self, cookie, request): 950 """ 951 If you override .set_ok(), be sure to call this method. If it returns 952 false, so should your subclass (assuming your subclass wants to be more 953 strict about which cookies to accept). 954 955 """ 956 _debug(" - checking cookie %s=%s", cookie.name, cookie.value) 957 958 assert cookie.name is not None 959 960 for n in "version", "verifiability", "name", "path", "domain", "port": 961 fn_name = "set_ok_"+n 962 fn = getattr(self, fn_name) 963 if not fn(cookie, request): 964 return False 965 966 return True 967 968 def set_ok_version(self, cookie, request): 969 if cookie.version is None: 970 # Version is always set to 0 by parse_ns_headers if it's a Netscape 971 # cookie, so this must be an invalid RFC 2965 cookie. 972 _debug(" Set-Cookie2 without version attribute (%s=%s)", 973 cookie.name, cookie.value) 974 return False 975 if cookie.version > 0 and not self.rfc2965: 976 _debug(" RFC 2965 cookies are switched off") 977 return False 978 elif cookie.version == 0 and not self.netscape: 979 _debug(" Netscape cookies are switched off") 980 return False 981 return True 982 983 def set_ok_verifiability(self, cookie, request): 984 if request.unverifiable and is_third_party(request): 985 if cookie.version > 0 and self.strict_rfc2965_unverifiable: 986 _debug(" third-party RFC 2965 cookie during " 987 "unverifiable transaction") 988 return False 989 elif cookie.version == 0 and self.strict_ns_unverifiable: 990 _debug(" third-party Netscape cookie during " 991 "unverifiable transaction") 992 return False 993 return True 994 995 def set_ok_name(self, cookie, request): 996 # Try and stop servers setting V0 cookies designed to hack other 997 # servers that know both V0 and V1 protocols. 998 if (cookie.version == 0 and self.strict_ns_set_initial_dollar and 999 cookie.name.startswith("$")): 1000 _debug(" illegal name (starts with '$'): '%s'", cookie.name) 1001 return False 1002 return True 1003 1004 def set_ok_path(self, cookie, request): 1005 if cookie.path_specified: 1006 req_path = request_path(request) 1007 if ((cookie.version > 0 or 1008 (cookie.version == 0 and self.strict_ns_set_path)) and 1009 not self.path_return_ok(cookie.path, request)): 1010 _debug(" path attribute %s is not a prefix of request " 1011 "path %s", cookie.path, req_path) 1012 return False 1013 return True 1014 1015 def set_ok_domain(self, cookie, request): 1016 if self.is_blocked(cookie.domain): 1017 _debug(" domain %s is in user block-list", cookie.domain) 1018 return False 1019 if self.is_not_allowed(cookie.domain): 1020 _debug(" domain %s is not in user allow-list", cookie.domain) 1021 return False 1022 if cookie.domain_specified: 1023 req_host, erhn = eff_request_host(request) 1024 domain = cookie.domain 1025 if self.strict_domain and (domain.count(".") >= 2): 1026 # XXX This should probably be compared with the Konqueror 1027 # (kcookiejar.cpp) and Mozilla implementations, but it's a 1028 # losing battle. 1029 i = domain.rfind(".") 1030 j = domain.rfind(".", 0, i) 1031 if j == 0: # domain like .foo.bar 1032 tld = domain[i+1:] 1033 sld = domain[j+1:i] 1034 if sld.lower() in ("co", "ac", "com", "edu", "org", "net", 1035 "gov", "mil", "int", "aero", "biz", "cat", "coop", 1036 "info", "jobs", "mobi", "museum", "name", "pro", 1037 "travel", "eu") and len(tld) == 2: 1038 # domain like .co.uk 1039 _debug(" country-code second level domain %s", domain) 1040 return False 1041 if domain.startswith("."): 1042 undotted_domain = domain[1:] 1043 else: 1044 undotted_domain = domain 1045 embedded_dots = (undotted_domain.find(".") >= 0) 1046 if not embedded_dots and not erhn.endswith(".local"): 1047 _debug(" non-local domain %s contains no embedded dot", 1048 domain) 1049 return False 1050 if cookie.version == 0: 1051 if (not (erhn.endswith(domain) or 1052 erhn.endswith(f"{undotted_domain}.local")) and 1053 (not erhn.startswith(".") and 1054 not ("."+erhn).endswith(domain))): 1055 _debug(" effective request-host %s (even with added " 1056 "initial dot) does not end with %s", 1057 erhn, domain) 1058 return False 1059 if (cookie.version > 0 or 1060 (self.strict_ns_domain & self.DomainRFC2965Match)): 1061 if not domain_match(erhn, domain): 1062 _debug(" effective request-host %s does not domain-match " 1063 "%s", erhn, domain) 1064 return False 1065 if (cookie.version > 0 or 1066 (self.strict_ns_domain & self.DomainStrictNoDots)): 1067 host_prefix = req_host[:-len(domain)] 1068 if (host_prefix.find(".") >= 0 and 1069 not IPV4_RE.search(req_host)): 1070 _debug(" host prefix %s for domain %s contains a dot", 1071 host_prefix, domain) 1072 return False 1073 return True 1074 1075 def set_ok_port(self, cookie, request): 1076 if cookie.port_specified: 1077 req_port = request_port(request) 1078 if req_port is None: 1079 req_port = "80" 1080 else: 1081 req_port = str(req_port) 1082 for p in cookie.port.split(","): 1083 try: 1084 int(p) 1085 except ValueError: 1086 _debug(" bad port %s (not numeric)", p) 1087 return False 1088 if p == req_port: 1089 break 1090 else: 1091 _debug(" request port (%s) not found in %s", 1092 req_port, cookie.port) 1093 return False 1094 return True 1095 1096 def return_ok(self, cookie, request): 1097 """ 1098 If you override .return_ok(), be sure to call this method. If it 1099 returns false, so should your subclass (assuming your subclass wants to 1100 be more strict about which cookies to return). 1101 1102 """ 1103 # Path has already been checked by .path_return_ok(), and domain 1104 # blocking done by .domain_return_ok(). 1105 _debug(" - checking cookie %s=%s", cookie.name, cookie.value) 1106 1107 for n in "version", "verifiability", "secure", "expires", "port", "domain": 1108 fn_name = "return_ok_"+n 1109 fn = getattr(self, fn_name) 1110 if not fn(cookie, request): 1111 return False 1112 return True 1113 1114 def return_ok_version(self, cookie, request): 1115 if cookie.version > 0 and not self.rfc2965: 1116 _debug(" RFC 2965 cookies are switched off") 1117 return False 1118 elif cookie.version == 0 and not self.netscape: 1119 _debug(" Netscape cookies are switched off") 1120 return False 1121 return True 1122 1123 def return_ok_verifiability(self, cookie, request): 1124 if request.unverifiable and is_third_party(request): 1125 if cookie.version > 0 and self.strict_rfc2965_unverifiable: 1126 _debug(" third-party RFC 2965 cookie during unverifiable " 1127 "transaction") 1128 return False 1129 elif cookie.version == 0 and self.strict_ns_unverifiable: 1130 _debug(" third-party Netscape cookie during unverifiable " 1131 "transaction") 1132 return False 1133 return True 1134 1135 def return_ok_secure(self, cookie, request): 1136 if cookie.secure and request.type not in self.secure_protocols: 1137 _debug(" secure cookie with non-secure request") 1138 return False 1139 return True 1140 1141 def return_ok_expires(self, cookie, request): 1142 if cookie.is_expired(self._now): 1143 _debug(" cookie expired") 1144 return False 1145 return True 1146 1147 def return_ok_port(self, cookie, request): 1148 if cookie.port: 1149 req_port = request_port(request) 1150 if req_port is None: 1151 req_port = "80" 1152 for p in cookie.port.split(","): 1153 if p == req_port: 1154 break 1155 else: 1156 _debug(" request port %s does not match cookie port %s", 1157 req_port, cookie.port) 1158 return False 1159 return True 1160 1161 def return_ok_domain(self, cookie, request): 1162 req_host, erhn = eff_request_host(request) 1163 domain = cookie.domain 1164 1165 if domain and not domain.startswith("."): 1166 dotdomain = "." + domain 1167 else: 1168 dotdomain = domain 1169 1170 # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't 1171 if (cookie.version == 0 and 1172 (self.strict_ns_domain & self.DomainStrictNonDomain) and 1173 not cookie.domain_specified and domain != erhn): 1174 _debug(" cookie with unspecified domain does not string-compare " 1175 "equal to request domain") 1176 return False 1177 1178 if cookie.version > 0 and not domain_match(erhn, domain): 1179 _debug(" effective request-host name %s does not domain-match " 1180 "RFC 2965 cookie domain %s", erhn, domain) 1181 return False 1182 if cookie.version == 0 and not ("."+erhn).endswith(dotdomain): 1183 _debug(" request-host %s does not match Netscape cookie domain " 1184 "%s", req_host, domain) 1185 return False 1186 return True 1187 1188 def domain_return_ok(self, domain, request): 1189 # Liberal check of. This is here as an optimization to avoid 1190 # having to load lots of MSIE cookie files unless necessary. 1191 req_host, erhn = eff_request_host(request) 1192 if not req_host.startswith("."): 1193 req_host = "."+req_host 1194 if not erhn.startswith("."): 1195 erhn = "."+erhn 1196 if domain and not domain.startswith("."): 1197 dotdomain = "." + domain 1198 else: 1199 dotdomain = domain 1200 if not (req_host.endswith(dotdomain) or erhn.endswith(dotdomain)): 1201 #_debug(" request domain %s does not match cookie domain %s", 1202 # req_host, domain) 1203 return False 1204 1205 if self.is_blocked(domain): 1206 _debug(" domain %s is in user block-list", domain) 1207 return False 1208 if self.is_not_allowed(domain): 1209 _debug(" domain %s is not in user allow-list", domain) 1210 return False 1211 1212 return True 1213 1214 def path_return_ok(self, path, request): 1215 _debug("- checking cookie path=%s", path) 1216 req_path = request_path(request) 1217 pathlen = len(path) 1218 if req_path == path: 1219 return True 1220 elif (req_path.startswith(path) and 1221 (path.endswith("/") or req_path[pathlen:pathlen+1] == "/")): 1222 return True 1223 1224 _debug(" %s does not path-match %s", req_path, path) 1225 return False 1226 1227def deepvalues(mapping): 1228 """Iterates over nested mapping, depth-first""" 1229 for obj in list(mapping.values()): 1230 mapping = False 1231 try: 1232 obj.items 1233 except AttributeError: 1234 pass 1235 else: 1236 mapping = True 1237 yield from deepvalues(obj) 1238 if not mapping: 1239 yield obj 1240 1241 1242# Used as second parameter to dict.get() method, to distinguish absent 1243# dict key from one with a None value. 1244class Absent: pass 1245 1246class CookieJar: 1247 """Collection of HTTP cookies. 1248 1249 You may not need to know about this class: try 1250 urllib.request.build_opener(HTTPCookieProcessor).open(url). 1251 """ 1252 1253 non_word_re = re.compile(r"\W") 1254 quote_re = re.compile(r"([\"\\])") 1255 strict_domain_re = re.compile(r"\.?[^.]*") 1256 domain_re = re.compile(r"[^.]*") 1257 dots_re = re.compile(r"^\.+") 1258 1259 magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII) 1260 1261 def __init__(self, policy=None): 1262 if policy is None: 1263 policy = DefaultCookiePolicy() 1264 self._policy = policy 1265 1266 self._cookies_lock = _threading.RLock() 1267 self._cookies = {} 1268 1269 def set_policy(self, policy): 1270 self._policy = policy 1271 1272 def _cookies_for_domain(self, domain, request): 1273 cookies = [] 1274 if not self._policy.domain_return_ok(domain, request): 1275 return [] 1276 _debug("Checking %s for cookies to return", domain) 1277 cookies_by_path = self._cookies[domain] 1278 for path in cookies_by_path.keys(): 1279 if not self._policy.path_return_ok(path, request): 1280 continue 1281 cookies_by_name = cookies_by_path[path] 1282 for cookie in cookies_by_name.values(): 1283 if not self._policy.return_ok(cookie, request): 1284 _debug(" not returning cookie") 1285 continue 1286 _debug(" it's a match") 1287 cookies.append(cookie) 1288 return cookies 1289 1290 def _cookies_for_request(self, request): 1291 """Return a list of cookies to be returned to server.""" 1292 cookies = [] 1293 for domain in self._cookies.keys(): 1294 cookies.extend(self._cookies_for_domain(domain, request)) 1295 return cookies 1296 1297 def _cookie_attrs(self, cookies): 1298 """Return a list of cookie-attributes to be returned to server. 1299 1300 like ['foo="bar"; $Path="/"', ...] 1301 1302 The $Version attribute is also added when appropriate (currently only 1303 once per request). 1304 1305 """ 1306 # add cookies in order of most specific (ie. longest) path first 1307 cookies.sort(key=lambda a: len(a.path), reverse=True) 1308 1309 version_set = False 1310 1311 attrs = [] 1312 for cookie in cookies: 1313 # set version of Cookie header 1314 # XXX 1315 # What should it be if multiple matching Set-Cookie headers have 1316 # different versions themselves? 1317 # Answer: there is no answer; was supposed to be settled by 1318 # RFC 2965 errata, but that may never appear... 1319 version = cookie.version 1320 if not version_set: 1321 version_set = True 1322 if version > 0: 1323 attrs.append("$Version=%s" % version) 1324 1325 # quote cookie value if necessary 1326 # (not for Netscape protocol, which already has any quotes 1327 # intact, due to the poorly-specified Netscape Cookie: syntax) 1328 if ((cookie.value is not None) and 1329 self.non_word_re.search(cookie.value) and version > 0): 1330 value = self.quote_re.sub(r"\\\1", cookie.value) 1331 else: 1332 value = cookie.value 1333 1334 # add cookie-attributes to be returned in Cookie header 1335 if cookie.value is None: 1336 attrs.append(cookie.name) 1337 else: 1338 attrs.append("%s=%s" % (cookie.name, value)) 1339 if version > 0: 1340 if cookie.path_specified: 1341 attrs.append('$Path="%s"' % cookie.path) 1342 if cookie.domain.startswith("."): 1343 domain = cookie.domain 1344 if (not cookie.domain_initial_dot and 1345 domain.startswith(".")): 1346 domain = domain[1:] 1347 attrs.append('$Domain="%s"' % domain) 1348 if cookie.port is not None: 1349 p = "$Port" 1350 if cookie.port_specified: 1351 p = p + ('="%s"' % cookie.port) 1352 attrs.append(p) 1353 1354 return attrs 1355 1356 def add_cookie_header(self, request): 1357 """Add correct Cookie: header to request (urllib.request.Request object). 1358 1359 The Cookie2 header is also added unless policy.hide_cookie2 is true. 1360 1361 """ 1362 _debug("add_cookie_header") 1363 self._cookies_lock.acquire() 1364 try: 1365 1366 self._policy._now = self._now = int(time.time()) 1367 1368 cookies = self._cookies_for_request(request) 1369 1370 attrs = self._cookie_attrs(cookies) 1371 if attrs: 1372 if not request.has_header("Cookie"): 1373 request.add_unredirected_header( 1374 "Cookie", "; ".join(attrs)) 1375 1376 # if necessary, advertise that we know RFC 2965 1377 if (self._policy.rfc2965 and not self._policy.hide_cookie2 and 1378 not request.has_header("Cookie2")): 1379 for cookie in cookies: 1380 if cookie.version != 1: 1381 request.add_unredirected_header("Cookie2", '$Version="1"') 1382 break 1383 1384 finally: 1385 self._cookies_lock.release() 1386 1387 self.clear_expired_cookies() 1388 1389 def _normalized_cookie_tuples(self, attrs_set): 1390 """Return list of tuples containing normalised cookie information. 1391 1392 attrs_set is the list of lists of key,value pairs extracted from 1393 the Set-Cookie or Set-Cookie2 headers. 1394 1395 Tuples are name, value, standard, rest, where name and value are the 1396 cookie name and value, standard is a dictionary containing the standard 1397 cookie-attributes (discard, secure, version, expires or max-age, 1398 domain, path and port) and rest is a dictionary containing the rest of 1399 the cookie-attributes. 1400 1401 """ 1402 cookie_tuples = [] 1403 1404 boolean_attrs = "discard", "secure" 1405 value_attrs = ("version", 1406 "expires", "max-age", 1407 "domain", "path", "port", 1408 "comment", "commenturl") 1409 1410 for cookie_attrs in attrs_set: 1411 name, value = cookie_attrs[0] 1412 1413 # Build dictionary of standard cookie-attributes (standard) and 1414 # dictionary of other cookie-attributes (rest). 1415 1416 # Note: expiry time is normalised to seconds since epoch. V0 1417 # cookies should have the Expires cookie-attribute, and V1 cookies 1418 # should have Max-Age, but since V1 includes RFC 2109 cookies (and 1419 # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we 1420 # accept either (but prefer Max-Age). 1421 max_age_set = False 1422 1423 bad_cookie = False 1424 1425 standard = {} 1426 rest = {} 1427 for k, v in cookie_attrs[1:]: 1428 lc = k.lower() 1429 # don't lose case distinction for unknown fields 1430 if lc in value_attrs or lc in boolean_attrs: 1431 k = lc 1432 if k in boolean_attrs and v is None: 1433 # boolean cookie-attribute is present, but has no value 1434 # (like "discard", rather than "port=80") 1435 v = True 1436 if k in standard: 1437 # only first value is significant 1438 continue 1439 if k == "domain": 1440 if v is None: 1441 _debug(" missing value for domain attribute") 1442 bad_cookie = True 1443 break 1444 # RFC 2965 section 3.3.3 1445 v = v.lower() 1446 if k == "expires": 1447 if max_age_set: 1448 # Prefer max-age to expires (like Mozilla) 1449 continue 1450 if v is None: 1451 _debug(" missing or invalid value for expires " 1452 "attribute: treating as session cookie") 1453 continue 1454 if k == "max-age": 1455 max_age_set = True 1456 try: 1457 v = int(v) 1458 except ValueError: 1459 _debug(" missing or invalid (non-numeric) value for " 1460 "max-age attribute") 1461 bad_cookie = True 1462 break 1463 # convert RFC 2965 Max-Age to seconds since epoch 1464 # XXX Strictly you're supposed to follow RFC 2616 1465 # age-calculation rules. Remember that zero Max-Age 1466 # is a request to discard (old and new) cookie, though. 1467 k = "expires" 1468 v = self._now + v 1469 if (k in value_attrs) or (k in boolean_attrs): 1470 if (v is None and 1471 k not in ("port", "comment", "commenturl")): 1472 _debug(" missing value for %s attribute" % k) 1473 bad_cookie = True 1474 break 1475 standard[k] = v 1476 else: 1477 rest[k] = v 1478 1479 if bad_cookie: 1480 continue 1481 1482 cookie_tuples.append((name, value, standard, rest)) 1483 1484 return cookie_tuples 1485 1486 def _cookie_from_cookie_tuple(self, tup, request): 1487 # standard is dict of standard cookie-attributes, rest is dict of the 1488 # rest of them 1489 name, value, standard, rest = tup 1490 1491 domain = standard.get("domain", Absent) 1492 path = standard.get("path", Absent) 1493 port = standard.get("port", Absent) 1494 expires = standard.get("expires", Absent) 1495 1496 # set the easy defaults 1497 version = standard.get("version", None) 1498 if version is not None: 1499 try: 1500 version = int(version) 1501 except ValueError: 1502 return None # invalid version, ignore cookie 1503 secure = standard.get("secure", False) 1504 # (discard is also set if expires is Absent) 1505 discard = standard.get("discard", False) 1506 comment = standard.get("comment", None) 1507 comment_url = standard.get("commenturl", None) 1508 1509 # set default path 1510 if path is not Absent and path != "": 1511 path_specified = True 1512 path = escape_path(path) 1513 else: 1514 path_specified = False 1515 path = request_path(request) 1516 i = path.rfind("/") 1517 if i != -1: 1518 if version == 0: 1519 # Netscape spec parts company from reality here 1520 path = path[:i] 1521 else: 1522 path = path[:i+1] 1523 if len(path) == 0: path = "/" 1524 1525 # set default domain 1526 domain_specified = domain is not Absent 1527 # but first we have to remember whether it starts with a dot 1528 domain_initial_dot = False 1529 if domain_specified: 1530 domain_initial_dot = bool(domain.startswith(".")) 1531 if domain is Absent: 1532 req_host, erhn = eff_request_host(request) 1533 domain = erhn 1534 elif not domain.startswith("."): 1535 domain = "."+domain 1536 1537 # set default port 1538 port_specified = False 1539 if port is not Absent: 1540 if port is None: 1541 # Port attr present, but has no value: default to request port. 1542 # Cookie should then only be sent back on that port. 1543 port = request_port(request) 1544 else: 1545 port_specified = True 1546 port = re.sub(r"\s+", "", port) 1547 else: 1548 # No port attr present. Cookie can be sent back on any port. 1549 port = None 1550 1551 # set default expires and discard 1552 if expires is Absent: 1553 expires = None 1554 discard = True 1555 elif expires <= self._now: 1556 # Expiry date in past is request to delete cookie. This can't be 1557 # in DefaultCookiePolicy, because can't delete cookies there. 1558 try: 1559 self.clear(domain, path, name) 1560 except KeyError: 1561 pass 1562 _debug("Expiring cookie, domain='%s', path='%s', name='%s'", 1563 domain, path, name) 1564 return None 1565 1566 return Cookie(version, 1567 name, value, 1568 port, port_specified, 1569 domain, domain_specified, domain_initial_dot, 1570 path, path_specified, 1571 secure, 1572 expires, 1573 discard, 1574 comment, 1575 comment_url, 1576 rest) 1577 1578 def _cookies_from_attrs_set(self, attrs_set, request): 1579 cookie_tuples = self._normalized_cookie_tuples(attrs_set) 1580 1581 cookies = [] 1582 for tup in cookie_tuples: 1583 cookie = self._cookie_from_cookie_tuple(tup, request) 1584 if cookie: cookies.append(cookie) 1585 return cookies 1586 1587 def _process_rfc2109_cookies(self, cookies): 1588 rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None) 1589 if rfc2109_as_ns is None: 1590 rfc2109_as_ns = not self._policy.rfc2965 1591 for cookie in cookies: 1592 if cookie.version == 1: 1593 cookie.rfc2109 = True 1594 if rfc2109_as_ns: 1595 # treat 2109 cookies as Netscape cookies rather than 1596 # as RFC2965 cookies 1597 cookie.version = 0 1598 1599 def make_cookies(self, response, request): 1600 """Return sequence of Cookie objects extracted from response object.""" 1601 # get cookie-attributes for RFC 2965 and Netscape protocols 1602 headers = response.info() 1603 rfc2965_hdrs = headers.get_all("Set-Cookie2", []) 1604 ns_hdrs = headers.get_all("Set-Cookie", []) 1605 self._policy._now = self._now = int(time.time()) 1606 1607 rfc2965 = self._policy.rfc2965 1608 netscape = self._policy.netscape 1609 1610 if ((not rfc2965_hdrs and not ns_hdrs) or 1611 (not ns_hdrs and not rfc2965) or 1612 (not rfc2965_hdrs and not netscape) or 1613 (not netscape and not rfc2965)): 1614 return [] # no relevant cookie headers: quick exit 1615 1616 try: 1617 cookies = self._cookies_from_attrs_set( 1618 split_header_words(rfc2965_hdrs), request) 1619 except Exception: 1620 _warn_unhandled_exception() 1621 cookies = [] 1622 1623 if ns_hdrs and netscape: 1624 try: 1625 # RFC 2109 and Netscape cookies 1626 ns_cookies = self._cookies_from_attrs_set( 1627 parse_ns_headers(ns_hdrs), request) 1628 except Exception: 1629 _warn_unhandled_exception() 1630 ns_cookies = [] 1631 self._process_rfc2109_cookies(ns_cookies) 1632 1633 # Look for Netscape cookies (from Set-Cookie headers) that match 1634 # corresponding RFC 2965 cookies (from Set-Cookie2 headers). 1635 # For each match, keep the RFC 2965 cookie and ignore the Netscape 1636 # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are 1637 # bundled in with the Netscape cookies for this purpose, which is 1638 # reasonable behaviour. 1639 if rfc2965: 1640 lookup = {} 1641 for cookie in cookies: 1642 lookup[(cookie.domain, cookie.path, cookie.name)] = None 1643 1644 def no_matching_rfc2965(ns_cookie, lookup=lookup): 1645 key = ns_cookie.domain, ns_cookie.path, ns_cookie.name 1646 return key not in lookup 1647 ns_cookies = filter(no_matching_rfc2965, ns_cookies) 1648 1649 if ns_cookies: 1650 cookies.extend(ns_cookies) 1651 1652 return cookies 1653 1654 def set_cookie_if_ok(self, cookie, request): 1655 """Set a cookie if policy says it's OK to do so.""" 1656 self._cookies_lock.acquire() 1657 try: 1658 self._policy._now = self._now = int(time.time()) 1659 1660 if self._policy.set_ok(cookie, request): 1661 self.set_cookie(cookie) 1662 1663 1664 finally: 1665 self._cookies_lock.release() 1666 1667 def set_cookie(self, cookie): 1668 """Set a cookie, without checking whether or not it should be set.""" 1669 c = self._cookies 1670 self._cookies_lock.acquire() 1671 try: 1672 if cookie.domain not in c: c[cookie.domain] = {} 1673 c2 = c[cookie.domain] 1674 if cookie.path not in c2: c2[cookie.path] = {} 1675 c3 = c2[cookie.path] 1676 c3[cookie.name] = cookie 1677 finally: 1678 self._cookies_lock.release() 1679 1680 def extract_cookies(self, response, request): 1681 """Extract cookies from response, where allowable given the request.""" 1682 _debug("extract_cookies: %s", response.info()) 1683 self._cookies_lock.acquire() 1684 try: 1685 for cookie in self.make_cookies(response, request): 1686 if self._policy.set_ok(cookie, request): 1687 _debug(" setting cookie: %s", cookie) 1688 self.set_cookie(cookie) 1689 finally: 1690 self._cookies_lock.release() 1691 1692 def clear(self, domain=None, path=None, name=None): 1693 """Clear some cookies. 1694 1695 Invoking this method without arguments will clear all cookies. If 1696 given a single argument, only cookies belonging to that domain will be 1697 removed. If given two arguments, cookies belonging to the specified 1698 path within that domain are removed. If given three arguments, then 1699 the cookie with the specified name, path and domain is removed. 1700 1701 Raises KeyError if no matching cookie exists. 1702 1703 """ 1704 if name is not None: 1705 if (domain is None) or (path is None): 1706 raise ValueError( 1707 "domain and path must be given to remove a cookie by name") 1708 del self._cookies[domain][path][name] 1709 elif path is not None: 1710 if domain is None: 1711 raise ValueError( 1712 "domain must be given to remove cookies by path") 1713 del self._cookies[domain][path] 1714 elif domain is not None: 1715 del self._cookies[domain] 1716 else: 1717 self._cookies = {} 1718 1719 def clear_session_cookies(self): 1720 """Discard all session cookies. 1721 1722 Note that the .save() method won't save session cookies anyway, unless 1723 you ask otherwise by passing a true ignore_discard argument. 1724 1725 """ 1726 self._cookies_lock.acquire() 1727 try: 1728 for cookie in self: 1729 if cookie.discard: 1730 self.clear(cookie.domain, cookie.path, cookie.name) 1731 finally: 1732 self._cookies_lock.release() 1733 1734 def clear_expired_cookies(self): 1735 """Discard all expired cookies. 1736 1737 You probably don't need to call this method: expired cookies are never 1738 sent back to the server (provided you're using DefaultCookiePolicy), 1739 this method is called by CookieJar itself every so often, and the 1740 .save() method won't save expired cookies anyway (unless you ask 1741 otherwise by passing a true ignore_expires argument). 1742 1743 """ 1744 self._cookies_lock.acquire() 1745 try: 1746 now = time.time() 1747 for cookie in self: 1748 if cookie.is_expired(now): 1749 self.clear(cookie.domain, cookie.path, cookie.name) 1750 finally: 1751 self._cookies_lock.release() 1752 1753 def __iter__(self): 1754 return deepvalues(self._cookies) 1755 1756 def __len__(self): 1757 """Return number of contained cookies.""" 1758 i = 0 1759 for cookie in self: i = i + 1 1760 return i 1761 1762 def __repr__(self): 1763 r = [] 1764 for cookie in self: r.append(repr(cookie)) 1765 return "<%s[%s]>" % (self.__class__.__name__, ", ".join(r)) 1766 1767 def __str__(self): 1768 r = [] 1769 for cookie in self: r.append(str(cookie)) 1770 return "<%s[%s]>" % (self.__class__.__name__, ", ".join(r)) 1771 1772 1773# derives from OSError for backwards-compatibility with Python 2.4.0 1774class LoadError(OSError): pass 1775 1776class FileCookieJar(CookieJar): 1777 """CookieJar that can be loaded from and saved to a file.""" 1778 1779 def __init__(self, filename=None, delayload=False, policy=None): 1780 """ 1781 Cookies are NOT loaded from the named file until either the .load() or 1782 .revert() method is called. 1783 1784 """ 1785 CookieJar.__init__(self, policy) 1786 if filename is not None: 1787 filename = os.fspath(filename) 1788 self.filename = filename 1789 self.delayload = bool(delayload) 1790 1791 def save(self, filename=None, ignore_discard=False, ignore_expires=False): 1792 """Save cookies to a file.""" 1793 raise NotImplementedError() 1794 1795 def load(self, filename=None, ignore_discard=False, ignore_expires=False): 1796 """Load cookies from a file.""" 1797 if filename is None: 1798 if self.filename is not None: filename = self.filename 1799 else: raise ValueError(MISSING_FILENAME_TEXT) 1800 1801 with open(filename) as f: 1802 self._really_load(f, filename, ignore_discard, ignore_expires) 1803 1804 def revert(self, filename=None, 1805 ignore_discard=False, ignore_expires=False): 1806 """Clear all cookies and reload cookies from a saved file. 1807 1808 Raises LoadError (or OSError) if reversion is not successful; the 1809 object's state will not be altered if this happens. 1810 1811 """ 1812 if filename is None: 1813 if self.filename is not None: filename = self.filename 1814 else: raise ValueError(MISSING_FILENAME_TEXT) 1815 1816 self._cookies_lock.acquire() 1817 try: 1818 1819 old_state = copy.deepcopy(self._cookies) 1820 self._cookies = {} 1821 try: 1822 self.load(filename, ignore_discard, ignore_expires) 1823 except OSError: 1824 self._cookies = old_state 1825 raise 1826 1827 finally: 1828 self._cookies_lock.release() 1829 1830 1831def lwp_cookie_str(cookie): 1832 """Return string representation of Cookie in the LWP cookie file format. 1833 1834 Actually, the format is extended a bit -- see module docstring. 1835 1836 """ 1837 h = [(cookie.name, cookie.value), 1838 ("path", cookie.path), 1839 ("domain", cookie.domain)] 1840 if cookie.port is not None: h.append(("port", cookie.port)) 1841 if cookie.path_specified: h.append(("path_spec", None)) 1842 if cookie.port_specified: h.append(("port_spec", None)) 1843 if cookie.domain_initial_dot: h.append(("domain_dot", None)) 1844 if cookie.secure: h.append(("secure", None)) 1845 if cookie.expires: h.append(("expires", 1846 time2isoz(float(cookie.expires)))) 1847 if cookie.discard: h.append(("discard", None)) 1848 if cookie.comment: h.append(("comment", cookie.comment)) 1849 if cookie.comment_url: h.append(("commenturl", cookie.comment_url)) 1850 1851 keys = sorted(cookie._rest.keys()) 1852 for k in keys: 1853 h.append((k, str(cookie._rest[k]))) 1854 1855 h.append(("version", str(cookie.version))) 1856 1857 return join_header_words([h]) 1858 1859class LWPCookieJar(FileCookieJar): 1860 """ 1861 The LWPCookieJar saves a sequence of "Set-Cookie3" lines. 1862 "Set-Cookie3" is the format used by the libwww-perl library, not known 1863 to be compatible with any browser, but which is easy to read and 1864 doesn't lose information about RFC 2965 cookies. 1865 1866 Additional methods 1867 1868 as_lwp_str(ignore_discard=True, ignore_expired=True) 1869 1870 """ 1871 1872 def as_lwp_str(self, ignore_discard=True, ignore_expires=True): 1873 """Return cookies as a string of "\\n"-separated "Set-Cookie3" headers. 1874 1875 ignore_discard and ignore_expires: see docstring for FileCookieJar.save 1876 1877 """ 1878 now = time.time() 1879 r = [] 1880 for cookie in self: 1881 if not ignore_discard and cookie.discard: 1882 continue 1883 if not ignore_expires and cookie.is_expired(now): 1884 continue 1885 r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie)) 1886 return "\n".join(r+[""]) 1887 1888 def save(self, filename=None, ignore_discard=False, ignore_expires=False): 1889 if filename is None: 1890 if self.filename is not None: filename = self.filename 1891 else: raise ValueError(MISSING_FILENAME_TEXT) 1892 1893 with os.fdopen( 1894 os.open(filename, os.O_CREAT | os.O_WRONLY | os.O_TRUNC, 0o600), 1895 'w', 1896 ) as f: 1897 # There really isn't an LWP Cookies 2.0 format, but this indicates 1898 # that there is extra information in here (domain_dot and 1899 # port_spec) while still being compatible with libwww-perl, I hope. 1900 f.write("#LWP-Cookies-2.0\n") 1901 f.write(self.as_lwp_str(ignore_discard, ignore_expires)) 1902 1903 def _really_load(self, f, filename, ignore_discard, ignore_expires): 1904 magic = f.readline() 1905 if not self.magic_re.search(magic): 1906 msg = ("%r does not look like a Set-Cookie3 (LWP) format " 1907 "file" % filename) 1908 raise LoadError(msg) 1909 1910 now = time.time() 1911 1912 header = "Set-Cookie3:" 1913 boolean_attrs = ("port_spec", "path_spec", "domain_dot", 1914 "secure", "discard") 1915 value_attrs = ("version", 1916 "port", "path", "domain", 1917 "expires", 1918 "comment", "commenturl") 1919 1920 try: 1921 while 1: 1922 line = f.readline() 1923 if line == "": break 1924 if not line.startswith(header): 1925 continue 1926 line = line[len(header):].strip() 1927 1928 for data in split_header_words([line]): 1929 name, value = data[0] 1930 standard = {} 1931 rest = {} 1932 for k in boolean_attrs: 1933 standard[k] = False 1934 for k, v in data[1:]: 1935 if k is not None: 1936 lc = k.lower() 1937 else: 1938 lc = None 1939 # don't lose case distinction for unknown fields 1940 if (lc in value_attrs) or (lc in boolean_attrs): 1941 k = lc 1942 if k in boolean_attrs: 1943 if v is None: v = True 1944 standard[k] = v 1945 elif k in value_attrs: 1946 standard[k] = v 1947 else: 1948 rest[k] = v 1949 1950 h = standard.get 1951 expires = h("expires") 1952 discard = h("discard") 1953 if expires is not None: 1954 expires = iso2time(expires) 1955 if expires is None: 1956 discard = True 1957 domain = h("domain") 1958 domain_specified = domain.startswith(".") 1959 c = Cookie(h("version"), name, value, 1960 h("port"), h("port_spec"), 1961 domain, domain_specified, h("domain_dot"), 1962 h("path"), h("path_spec"), 1963 h("secure"), 1964 expires, 1965 discard, 1966 h("comment"), 1967 h("commenturl"), 1968 rest) 1969 if not ignore_discard and c.discard: 1970 continue 1971 if not ignore_expires and c.is_expired(now): 1972 continue 1973 self.set_cookie(c) 1974 except OSError: 1975 raise 1976 except Exception: 1977 _warn_unhandled_exception() 1978 raise LoadError("invalid Set-Cookie3 format file %r: %r" % 1979 (filename, line)) 1980 1981 1982class MozillaCookieJar(FileCookieJar): 1983 """ 1984 1985 WARNING: you may want to backup your browser's cookies file if you use 1986 this class to save cookies. I *think* it works, but there have been 1987 bugs in the past! 1988 1989 This class differs from CookieJar only in the format it uses to save and 1990 load cookies to and from a file. This class uses the Mozilla/Netscape 1991 `cookies.txt' format. curl and lynx use this file format, too. 1992 1993 Don't expect cookies saved while the browser is running to be noticed by 1994 the browser (in fact, Mozilla on unix will overwrite your saved cookies if 1995 you change them on disk while it's running; on Windows, you probably can't 1996 save at all while the browser is running). 1997 1998 Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to 1999 Netscape cookies on saving. 2000 2001 In particular, the cookie version and port number information is lost, 2002 together with information about whether or not Path, Port and Discard were 2003 specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the 2004 domain as set in the HTTP header started with a dot (yes, I'm aware some 2005 domains in Netscape files start with a dot and some don't -- trust me, you 2006 really don't want to know any more about this). 2007 2008 Note that though Mozilla and Netscape use the same format, they use 2009 slightly different headers. The class saves cookies using the Netscape 2010 header by default (Mozilla can cope with that). 2011 2012 """ 2013 2014 def _really_load(self, f, filename, ignore_discard, ignore_expires): 2015 now = time.time() 2016 2017 if not NETSCAPE_MAGIC_RGX.match(f.readline()): 2018 raise LoadError( 2019 "%r does not look like a Netscape format cookies file" % 2020 filename) 2021 2022 try: 2023 while 1: 2024 line = f.readline() 2025 rest = {} 2026 2027 if line == "": break 2028 2029 # httponly is a cookie flag as defined in rfc6265 2030 # when encoded in a netscape cookie file, 2031 # the line is prepended with "#HttpOnly_" 2032 if line.startswith(HTTPONLY_PREFIX): 2033 rest[HTTPONLY_ATTR] = "" 2034 line = line[len(HTTPONLY_PREFIX):] 2035 2036 # last field may be absent, so keep any trailing tab 2037 if line.endswith("\n"): line = line[:-1] 2038 2039 # skip comments and blank lines XXX what is $ for? 2040 if (line.strip().startswith(("#", "$")) or 2041 line.strip() == ""): 2042 continue 2043 2044 domain, domain_specified, path, secure, expires, name, value = \ 2045 line.split("\t") 2046 secure = (secure == "TRUE") 2047 domain_specified = (domain_specified == "TRUE") 2048 if name == "": 2049 # cookies.txt regards 'Set-Cookie: foo' as a cookie 2050 # with no name, whereas http.cookiejar regards it as a 2051 # cookie with no value. 2052 name = value 2053 value = None 2054 2055 initial_dot = domain.startswith(".") 2056 assert domain_specified == initial_dot 2057 2058 discard = False 2059 if expires == "": 2060 expires = None 2061 discard = True 2062 2063 # assume path_specified is false 2064 c = Cookie(0, name, value, 2065 None, False, 2066 domain, domain_specified, initial_dot, 2067 path, False, 2068 secure, 2069 expires, 2070 discard, 2071 None, 2072 None, 2073 rest) 2074 if not ignore_discard and c.discard: 2075 continue 2076 if not ignore_expires and c.is_expired(now): 2077 continue 2078 self.set_cookie(c) 2079 2080 except OSError: 2081 raise 2082 except Exception: 2083 _warn_unhandled_exception() 2084 raise LoadError("invalid Netscape format cookies file %r: %r" % 2085 (filename, line)) 2086 2087 def save(self, filename=None, ignore_discard=False, ignore_expires=False): 2088 if filename is None: 2089 if self.filename is not None: filename = self.filename 2090 else: raise ValueError(MISSING_FILENAME_TEXT) 2091 2092 with os.fdopen( 2093 os.open(filename, os.O_CREAT | os.O_WRONLY | os.O_TRUNC, 0o600), 2094 'w', 2095 ) as f: 2096 f.write(NETSCAPE_HEADER_TEXT) 2097 now = time.time() 2098 for cookie in self: 2099 domain = cookie.domain 2100 if not ignore_discard and cookie.discard: 2101 continue 2102 if not ignore_expires and cookie.is_expired(now): 2103 continue 2104 if cookie.secure: secure = "TRUE" 2105 else: secure = "FALSE" 2106 if domain.startswith("."): initial_dot = "TRUE" 2107 else: initial_dot = "FALSE" 2108 if cookie.expires is not None: 2109 expires = str(cookie.expires) 2110 else: 2111 expires = "" 2112 if cookie.value is None: 2113 # cookies.txt regards 'Set-Cookie: foo' as a cookie 2114 # with no name, whereas http.cookiejar regards it as a 2115 # cookie with no value. 2116 name = "" 2117 value = cookie.name 2118 else: 2119 name = cookie.name 2120 value = cookie.value 2121 if cookie.has_nonstandard_attr(HTTPONLY_ATTR): 2122 domain = HTTPONLY_PREFIX + domain 2123 f.write( 2124 "\t".join([domain, initial_dot, cookie.path, 2125 secure, expires, name, value])+ 2126 "\n") 2127