A framework for quick web archiving
No puede seleccionar más de 25 temas Los temas deben comenzar con una letra o número, pueden incluir guiones ('-') y pueden tener hasta 35 caracteres de largo.

114 líneas
3.9 KiB

  1. import aiohttp
  2. import aiohttp.client_proto
  3. import aiohttp.connector
  4. import functools
  5. import itertools
  6. import time
  7. # aiohttp does not expose the raw data sent over the wire, so we need to get a bit creative...
  8. # The ResponseHandler handles received data; the writes are done directly on the underlying transport.
  9. # So ResponseHandler is replaced with a class which keeps all received data in a list, and the transport's write method is replaced with one which sends back all written data to the ResponseHandler.
  10. # Because the ResponseHandler instance disappears when the connection is closed (ClientResponse.{_response_eof,close,release}), ClientResponse copies the references to the data objects in the ResponseHandler.
  11. # aiohttp also does connection pooling/reuse, so ClientRequest resets the raw data when the request is sent. (This would not work with pipelining, but aiohttp does not support pipelining: https://github.com/aio-libs/aiohttp/issues/1740 )
  12. # This code has been developed for aiohttp version 2.3.10.
  13. #TODO: THERE IS A MEMORY LEAK HERE SOMEWHERE! I spent a whole day trying to find it without success.
  14. class RawData:
  15. def __init__(self):
  16. self.requestTimestamp = None
  17. self.requestData = []
  18. self.responseTimestamp = None
  19. self.responseData = []
  20. class ResponseHandler(aiohttp.client_proto.ResponseHandler):
  21. def __init__(self, *args, **kwargs):
  22. super().__init__(*args, **kwargs)
  23. self.rawData = None
  24. self.remoteAddress = None
  25. def data_received(self, data):
  26. super().data_received(data)
  27. if not data:
  28. return
  29. if self.rawData.responseTimestamp is None:
  30. self.rawData.responseTimestamp = time.time()
  31. self.rawData.responseData.append(data)
  32. def reset_raw_data(self):
  33. self.rawData = RawData()
  34. def make_transport_write(transport, protocol):
  35. transport._real_write = transport.write
  36. def write(self, data):
  37. if protocol.rawData.requestTimestamp is None:
  38. protocol.rawData.requestTimestamp = time.time()
  39. protocol.rawData.requestData.append(data)
  40. self._real_write(data)
  41. return write
  42. class TCPConnector(aiohttp.connector.TCPConnector):
  43. def __init__(self, *args, loop = None, **kwargs):
  44. super().__init__(*args, loop = loop, **kwargs)
  45. self._factory = functools.partial(ResponseHandler, loop = loop)
  46. async def _wrap_create_connection(self, protocolFactory, host, port, *args, **kwargs): #FIXME: Uses internal API
  47. transport, protocol = await super()._wrap_create_connection(protocolFactory, host, port, *args, **kwargs)
  48. transport.write = make_transport_write(transport, protocol).__get__(transport, type(transport)) # https://stackoverflow.com/a/28127947
  49. protocol.remoteAddress = (host, port)
  50. return (transport, protocol)
  51. class ClientRequest(aiohttp.client_reqrep.ClientRequest):
  52. def send(self, connection):
  53. connection.protocol.reset_raw_data()
  54. return super().send(connection)
  55. class ClientResponse(aiohttp.client_reqrep.ClientResponse):
  56. def __init__(self, *args, **kwargs):
  57. super().__init__(*args, **kwargs)
  58. self._rawData = None
  59. self._remoteAddress = None
  60. async def start(self, connection, readUntilEof):
  61. self._rawData = connection.protocol.rawData
  62. self._remoteAddress = connection.protocol.remoteAddress
  63. return (await super().start(connection, readUntilEof))
  64. @property
  65. def rawRequestTimestamp(self):
  66. return self._rawData.requestTimestamp
  67. @property
  68. def rawRequestData(self):
  69. return b''.join(self._rawData.requestData)
  70. @property
  71. def rawResponseTimestamp(self):
  72. return self._rawData.responseTimestamp
  73. @property
  74. def rawResponseData(self):
  75. return b''.join(self._rawData.responseData)
  76. @property
  77. def remoteAddress(self):
  78. return self._remoteAddress
  79. def set_history(self, history):
  80. self._history = history #FIXME: Uses private attribute of aiohttp.client_reqrep.ClientResponse
  81. def iter_all(self):
  82. return itertools.chain(self.history, (self,))
  83. async def release(self):
  84. if not self.closed:
  85. self.connection.reset_raw_data()
  86. await super().release()