A framework for quick web archiving
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

114 lignes
3.9 KiB

  1. import aiohttp
  2. import aiohttp.client_proto
  3. import aiohttp.connector
  4. import functools
  5. import itertools
  6. import time
  7. # aiohttp does not expose the raw data sent over the wire, so we need to get a bit creative...
  8. # The ResponseHandler handles received data; the writes are done directly on the underlying transport.
  9. # So ResponseHandler is replaced with a class which keeps all received data in a list, and the transport's write method is replaced with one which sends back all written data to the ResponseHandler.
  10. # Because the ResponseHandler instance disappears when the connection is closed (ClientResponse.{_response_eof,close,release}), ClientResponse copies the references to the data objects in the ResponseHandler.
  11. # aiohttp also does connection pooling/reuse, so ClientRequest resets the raw data when the request is sent. (This would not work with pipelining, but aiohttp does not support pipelining: https://github.com/aio-libs/aiohttp/issues/1740 )
  12. # This code has been developed for aiohttp version 2.3.10.
  13. #TODO: THERE IS A MEMORY LEAK HERE SOMEWHERE! I spent a whole day trying to find it without success.
  14. class RawData:
  15. def __init__(self):
  16. self.requestTimestamp = None
  17. self.requestData = []
  18. self.responseTimestamp = None
  19. self.responseData = []
  20. class ResponseHandler(aiohttp.client_proto.ResponseHandler):
  21. def __init__(self, *args, **kwargs):
  22. super().__init__(*args, **kwargs)
  23. self.rawData = None
  24. self.remoteAddress = None
  25. def data_received(self, data):
  26. super().data_received(data)
  27. if not data:
  28. return
  29. if self.rawData.responseTimestamp is None:
  30. self.rawData.responseTimestamp = time.time()
  31. self.rawData.responseData.append(data)
  32. def reset_raw_data(self):
  33. self.rawData = RawData()
  34. def make_transport_write(transport, protocol):
  35. transport._real_write = transport.write
  36. def write(self, data):
  37. if protocol.rawData.requestTimestamp is None:
  38. protocol.rawData.requestTimestamp = time.time()
  39. protocol.rawData.requestData.append(data)
  40. self._real_write(data)
  41. return write
  42. class TCPConnector(aiohttp.connector.TCPConnector):
  43. def __init__(self, *args, loop = None, **kwargs):
  44. super().__init__(*args, loop = loop, **kwargs)
  45. self._factory = functools.partial(ResponseHandler, loop = loop)
  46. async def _wrap_create_connection(self, protocolFactory, host, port, *args, **kwargs): #FIXME: Uses internal API
  47. transport, protocol = await super()._wrap_create_connection(protocolFactory, host, port, *args, **kwargs)
  48. transport.write = make_transport_write(transport, protocol).__get__(transport, type(transport)) # https://stackoverflow.com/a/28127947
  49. protocol.remoteAddress = (host, port)
  50. return (transport, protocol)
  51. class ClientRequest(aiohttp.client_reqrep.ClientRequest):
  52. def send(self, connection):
  53. connection.protocol.reset_raw_data()
  54. return super().send(connection)
  55. class ClientResponse(aiohttp.client_reqrep.ClientResponse):
  56. def __init__(self, *args, **kwargs):
  57. super().__init__(*args, **kwargs)
  58. self._rawData = None
  59. self._remoteAddress = None
  60. async def start(self, connection, readUntilEof):
  61. self._rawData = connection.protocol.rawData
  62. self._remoteAddress = connection.protocol.remoteAddress
  63. return (await super().start(connection, readUntilEof))
  64. @property
  65. def rawRequestTimestamp(self):
  66. return self._rawData.requestTimestamp
  67. @property
  68. def rawRequestData(self):
  69. return b''.join(self._rawData.requestData)
  70. @property
  71. def rawResponseTimestamp(self):
  72. return self._rawData.responseTimestamp
  73. @property
  74. def rawResponseData(self):
  75. return b''.join(self._rawData.responseData)
  76. @property
  77. def remoteAddress(self):
  78. return self._remoteAddress
  79. def set_history(self, history):
  80. self._history = history #FIXME: Uses private attribute of aiohttp.client_reqrep.ClientResponse
  81. def iter_all(self):
  82. return itertools.chain(self.history, (self,))
  83. async def release(self):
  84. if not self.closed:
  85. self.connection.reset_raw_data()
  86. await super().release()