diff --git a/qwarc/__init__.py b/qwarc/__init__.py index c3e3320..9e53a96 100644 --- a/qwarc/__init__.py +++ b/qwarc/__init__.py @@ -152,11 +152,6 @@ class Item: def generate(cls): yield from () # Generate no items by default - @classmethod - def _gen(cls): - for x in cls.generate(): - yield (cls.itemType, x, STATUS_TODO) - def add_subitem(self, itemClassOrType, itemValue): if issubclass(itemClassOrType, Item): item = (itemClassOrType.itemType, itemValue) @@ -307,6 +302,18 @@ class QWARC: self._db.isolation_level = None # Transactions are handled manually below. self._db.execute('PRAGMA synchronous = OFF') + cursor = await self.obtain_exclusive_db_lock() + try: + cursor.execute('SELECT name FROM sqlite_master WHERE type = "table" AND name = "items"') + result = cursor.fetchone() + if not result: + self._create_db(cursor) + self._insert_generated_items(cursor) + cursor.execute('COMMIT') + except: + cursor.execute('ROLLBACK') + raise + try: while True: while len(self._tasks) >= self._concurrency: @@ -393,18 +400,15 @@ class QWARC: await self._insert_subitems(item) item.clear_subitems() - def create_db(self): - db = sqlite3.connect(self._dbPath, timeout = 1) - db.execute('PRAGMA synchronous = OFF') - with db: - db.execute('CREATE TABLE items (id INTEGER PRIMARY KEY, type TEXT, value TEXT, status INTEGER)') - db.execute('CREATE INDEX items_status_idx ON items (status)') - db.execute('CREATE UNIQUE INDEX items_type_value_idx ON items (type, value)') + def _create_db(self, cursor): + cursor.execute('CREATE TABLE items (id INTEGER PRIMARY KEY, type TEXT, value TEXT, status INTEGER)') + cursor.execute('CREATE INDEX items_status_idx ON items (status)') + cursor.execute('CREATE UNIQUE INDEX items_type_value_idx ON items (type, value)') - it = itertools.chain(*(i._gen() for i in self._itemClasses)) + def _insert_generated_items(self, cursor): + it = itertools.chain((cls.itemType, value, STATUS_TODO) for cls in self._itemClasses for value in cls.generate()) while True: values = tuple(itertools.islice(it, 100000)) if not values: break - with db: - db.executemany('INSERT INTO items (type, value, status) VALUES (?, ?, ?)', values) + cursor.executemany('INSERT OR IGNORE INTO items (type, value, status) VALUES (?, ?, ?)', values) diff --git a/qwarc/cli.py b/qwarc/cli.py index 70ab325..18bf790 100644 --- a/qwarc/cli.py +++ b/qwarc/cli.py @@ -78,9 +78,6 @@ def main(): warcSizeLimit = args.warcsplit, warcDedupe = args.warcdedupe, ) - if not os.path.exists(args.database): - a.create_db() - loop = asyncio.get_event_loop() try: loop.run_until_complete(a.run(loop))