From eb617b0f4a598ab111767ec92d26cd1fa56a50be Mon Sep 17 00:00:00 2001 From: Hande Celikkanat <7702228+handecelikkanat@users.noreply.github.com> Date: Thu, 9 Apr 2026 17:45:43 +0300 Subject: [PATCH 1/6] feat: add direct remote access over s3 and https via warcio >= 1.8.0 --- Makefile | 27 +++++++++++++++++++++++++++ README.md | 45 +++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 2 +- warcio-iterator.py | 3 ++- 4 files changed, 75 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 9e89d08..dd68f82 100644 --- a/Makefile +++ b/Makefile @@ -22,6 +22,33 @@ iterate: python ./warcio-iterator.py whirlwind.warc.wat.gz @echo +#FIXME: Update s3 locations if moved to public bucket: +iterate-remote-s3: + @echo iterating over remote warcs over https: + @echo + @echo warc: + python ./warcio-iterator.py s3://commoncrawl-dev/whirlwind-example-files/whirlwind.warc.gz + @echo + @echo wet: + python ./warcio-iterator.py s3://commoncrawl-dev/whirlwind-example-files/whirlwind.warc.wet.gz + @echo + @echo wat: + python ./warcio-iterator.py s3://commoncrawl-dev/whirlwind-example-files/whirlwind.warc.wat.gz + + +#FIXME: We need the example files on public s3 bucket for this: +#iterate-remote-https: +# @echo iterating over remote warcs over https: +# @echo +# @echo warc: +# python ./warcio-iterator.py https://data.commoncrawl.org//whirlwind.warc.gz +# @echo +# @echo wet: +# python ./warcio-iterator.py https://data.commoncrawl.org//whirlwind.warc.wet.gz +# @echo +# @echo wat: +# python ./warcio-iterator.py https://data.commoncrawl.org//whirlwind.warc.wat.gz + cdxj: @echo "creating *.cdxj index files from the local warcs" cdxj-indexer whirlwind.warc.gz > whirlwind.warc.cdxj diff --git a/README.md b/README.md index 17a0a5a..1c7e544 100644 --- a/README.md +++ b/README.md @@ -174,6 +174,51 @@ python ./warcio-iterator.py whirlwind.warc.wat.gz The output has three sections, one each for the WARC, WET, and WAT. 
For each one, it prints the record types we saw before, plus the `WARC-Target-URI` for those record types that have it. +### Task 2-i: Iterating over "Remote" Files +So far we've been working with small local WARC files. But Common Crawl's real WARC files live on AWS S3. Since warcio 1.8, you can iterate over remote files exactly the same way as local ones — no download step required. We can do this over HTTPS or S3. + +If you have AWS credentials configured, you can stream directly from S3, which is faster if you're running on AWS. Although the S3 bucket is public, but S3 access still requires AWS credentials. + +`make iterate-remote-s3` + +
+ Click to view output +``` +iterating over remote warcs over s3: + +warc: +python ./warcio-iterator.py s3://commoncrawl-dev/whirlwind-example-files/whirlwind.warc.gz + WARC-Type: warcinfo + WARC-Type: request + WARC-Target-URI https://an.wikipedia.org/wiki/Escopete + WARC-Type: response + WARC-Target-URI https://an.wikipedia.org/wiki/Escopete + WARC-Type: metadata + WARC-Target-URI https://an.wikipedia.org/wiki/Escopete + +wet: +python ./warcio-iterator.py s3://commoncrawl-dev/whirlwind-example-files/whirlwind.warc.wet.gz + WARC-Type: warcinfo + WARC-Type: conversion + WARC-Target-URI https://an.wikipedia.org/wiki/Escopete + +wat: +python ./warcio-iterator.py s3://commoncrawl-dev/whirlwind-example-files/whirlwind.warc.wat.gz + WARC-Type: warcinfo + WARC-Type: metadata + WARC-Target-URI https://an.wikipedia.org/wiki/Escopete +``` +
+ + +If you don't have credentials configured, the HTTPS version works without any authentication. + +`make iterate-remote-https` + +
+ Click to view output +
+ ## Task 3: Index the WARC, WET, and WAT The example WARC files we've been using are tiny and easy to work with. The real WARC files are around a gigabyte in size and contain about 30,000 webpages each. What's more, we have around 24 million of these files! To read all of them, we could iterate, but what if we wanted random access so we could read just one particular record? We do that with an index. diff --git a/requirements.txt b/requirements.txt index 740b2f9..8875438 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -warcio +warcio[s3]>=1.8.0 cdx_toolkit duckdb pyarrow diff --git a/warcio-iterator.py b/warcio-iterator.py index 14a69bc..efbfb1c 100644 --- a/warcio-iterator.py +++ b/warcio-iterator.py @@ -2,10 +2,11 @@ import sys +from warcio.utils import fsspec_open from warcio.archiveiterator import ArchiveIterator for file in sys.argv[1:]: - with open(file, 'rb') as stream: + with fsspec_open(file, 'rb') as stream: for record in ArchiveIterator(stream): print(' ', 'WARC-Type:', record.rec_type) if record.rec_type in {'request', 'response', 'conversion', 'metadata'}: From ab23a9c61f5b622ad3fdfe101c91d6f41465d331 Mon Sep 17 00:00:00 2001 From: Hande Celikkanat <7702228+handecelikkanat@users.noreply.github.com> Date: Sun, 12 Apr 2026 14:47:45 +0300 Subject: [PATCH 2/6] fix(warcio-iterator.py): use fsspec.open instead of warcio.utils.fsspec_open, on malte's review --- warcio-iterator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/warcio-iterator.py b/warcio-iterator.py index efbfb1c..547d038 100644 --- a/warcio-iterator.py +++ b/warcio-iterator.py @@ -2,11 +2,11 @@ import sys -from warcio.utils import fsspec_open +import fsspec from warcio.archiveiterator import ArchiveIterator for file in sys.argv[1:]: - with fsspec_open(file, 'rb') as stream: + with fsspec.open(file, 'rb') as stream: for record in ArchiveIterator(stream): print(' ', 'WARC-Type:', record.rec_type) if record.rec_type in {'request', 'response', 'conversion', 
'metadata'}: From 7d99aee5599cb6e60e636cb0076368868762054e Mon Sep 17 00:00:00 2001 From: Hande Celikkanat <7702228+handecelikkanat@users.noreply.github.com> Date: Sun, 12 Apr 2026 19:06:43 +0300 Subject: [PATCH 3/6] fix(README.md): remove remote access details from Task 2 (to be moved to Task 3) --- README.md | 38 ++++++++------------------------------ 1 file changed, 8 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 1c7e544..cd7b786 100644 --- a/README.md +++ b/README.md @@ -174,20 +174,17 @@ python ./warcio-iterator.py whirlwind.warc.wat.gz The output has three sections, one each for the WARC, WET, and WAT. For each one, it prints the record types we saw before, plus the `WARC-Target-URI` for those record types that have it. -### Task 2-i: Iterating over "Remote" Files -So far we've been working with small local WARC files. But Common Crawl's real WARC files live on AWS S3. Since warcio 1.8, you can iterate over remote files exactly the same way as local ones — no download step required. We can do this over HTTPS or S3. +warcio also supports working on remote files, so let us try the same command on the remote version of the same WARC file we just iterated locally. We will reach this remote file from the Github repository for this tutorial: -If you have AWS credentials configured, you can stream directly from S3, which is faster if you're running on AWS. Although the S3 bucket is public, but S3 access still requires AWS credentials. - -`make iterate-remote-s3` +`make iterate-remote` +
+ Click to view code +python ./warcio-iterator.py https://raw.githubusercontent.com/commoncrawl/whirlwind-python/refs/heads/main/whirlwind.warc.gz +
+The output should be identical to what you saw from the local file:
Click to view output -``` -iterating over remote warcs over s3: - -warc: -python ./warcio-iterator.py s3://commoncrawl-dev/whirlwind-example-files/whirlwind.warc.gz WARC-Type: warcinfo WARC-Type: request WARC-Target-URI https://an.wikipedia.org/wiki/Escopete @@ -195,29 +192,10 @@ python ./warcio-iterator.py s3://commoncrawl-dev/whirlwind-example-files/whirlwi WARC-Target-URI https://an.wikipedia.org/wiki/Escopete WARC-Type: metadata WARC-Target-URI https://an.wikipedia.org/wiki/Escopete - -wet: -python ./warcio-iterator.py s3://commoncrawl-dev/whirlwind-example-files/whirlwind.warc.wet.gz - WARC-Type: warcinfo - WARC-Type: conversion - WARC-Target-URI https://an.wikipedia.org/wiki/Escopete - -wat: -python ./warcio-iterator.py s3://commoncrawl-dev/whirlwind-example-files/whirlwind.warc.wat.gz - WARC-Type: warcinfo - WARC-Type: metadata - WARC-Target-URI https://an.wikipedia.org/wiki/Escopete -```
+We get the same output, but this time by streaming the file over HTTPS instead of reading from local disk. Later in the tour, we will use this capability to index and extract records from remote WARC files hosted on AWS S3 buckets. -If you don't have credentials configured, the HTTPS version works without any authentication. - -`make iterate-remote-https` - -
- Click to view output -
## Task 3: Index the WARC, WET, and WAT From bfb84b2dd889d3534af1673b32ca3c0acb1b06c9 Mon Sep 17 00:00:00 2001 From: Hande Celikkanat <7702228+handecelikkanat@users.noreply.github.com> Date: Sun, 12 Apr 2026 20:26:38 +0300 Subject: [PATCH 4/6] feat: add remote cdxj-indexing and warcio-extracting of EoT WARCs over s3, https --- Makefile | 49 ++++++++++++++++++++---------------------- README.md | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 83 insertions(+), 30 deletions(-) diff --git a/Makefile b/Makefile index dd68f82..fd742b7 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,9 @@ +EOT_IA_WARC_HTTPS = https://eotarchive.s3.amazonaws.com/crawl-data/EOT-2024/segments/IA-000/warc/EOT24PRE-20240926172119-crawl804_EOT24PRE-20240926172119-00000.warc.gz +EOT_IA_WARC_S3 = s3://eotarchive/crawl-data/EOT-2024/segments/IA-000/warc/EOT24PRE-20240926172119-crawl804_EOT24PRE-20240926172119-00000.warc.gz +EOT_CC_WARC_HTTPS = https://eotarchive.s3.amazonaws.com/crawl-data/EOT-2024/segments/CC-000/warc/EOT-2024-REPACKAGE-CC-MAIN-2024-42-GOV-000000-001.warc.gz +EOT_CC_WARC_S3 = s3://eotarchive/crawl-data/EOT-2024/segments/CC-000/warc/EOT-2024-REPACKAGE-CC-MAIN-2024-42-GOV-000000-001.warc.gz +WHIRLWIND_WARC_HTTPS = https://raw.githubusercontent.com/commoncrawl/whirlwind-python/refs/heads/main/whirlwind.warc.gz + venv: @echo "making a venv in ~/venv/whirlwind" mkdir -p ~/venv @@ -22,32 +28,9 @@ iterate: python ./warcio-iterator.py whirlwind.warc.wat.gz @echo -#FIXME: Update s3 locations if moved to public bucket: -iterate-remote-s3: - @echo iterating over remote warcs over https: - @echo - @echo warc: - python ./warcio-iterator.py s3://commoncrawl-dev/whirlwind-example-files/whirlwind.warc.gz - @echo - @echo wet: - python ./warcio-iterator.py s3://commoncrawl-dev/whirlwind-example-files/whirlwind.warc.wet.gz - @echo - @echo wat: - python ./warcio-iterator.py s3://commoncrawl-dev/whirlwind-example-files/whirlwind.warc.wat.gz - - -#FIXME: We need the example 
files on public s3 bucket for this: -#iterate-remote-https: -# @echo iterating over remote warcs over https: -# @echo -# @echo warc: -# python ./warcio-iterator.py https://data.commoncrawl.org//whirlwind.warc.gz -# @echo -# @echo wet: -# python ./warcio-iterator.py https://data.commoncrawl.org//whirlwind.warc.wet.gz -# @echo -# @echo wat: -# python ./warcio-iterator.py https://data.commoncrawl.org//whirlwind.warc.wat.gz +iterate-remote: + @echo "iterating over whirlwind.warc.gz from GitHub via HTTPS:" + python ./warcio-iterator.py $(WHIRLWIND_WARC_HTTPS) cdxj: @echo "creating *.cdxj index files from the local warcs" @@ -55,6 +38,13 @@ cdxj: cdxj-indexer --records conversion whirlwind.warc.wet.gz > whirlwind.warc.wet.cdxj cdxj-indexer whirlwind.warc.wat.gz > whirlwind.warc.wat.cdxj +cdxj-remote: + @echo "indexing End-of-Term-2024 Internet Archive WARC over HTTPS (File size ~1GB, showing first 10 records):" + cdxj-indexer $(EOT_IA_WARC_HTTPS) 2>/dev/null | head -n 10 | tee eot-ia.cdxj + @echo + @echo "indexing End-of-Term-2024 Common Crawl repackage WARC over S3 (File size ~1GB, showing first 10 records):" + cdxj-indexer $(EOT_CC_WARC_S3) 2>/dev/null | head -n 10 | tee eot-cc.cdxj + extract: @echo "creating extraction.* from local warcs, the offset numbers are from the cdxj index" warcio extract --payload whirlwind.warc.gz 1023 > extraction.html @@ -62,6 +52,13 @@ extract: warcio extract --payload whirlwind.warc.wat.gz 443 > extraction.json @echo "hint: python -m json.tool extraction.json" +extract-remote: + @echo "extracting hpxml.nrel.gov record from End-of-Term Internet Archive WARC over HTTPS (offset 50755):" + warcio extract $(EOT_IA_WARC_HTTPS) 50755 + @echo + @echo "extracting before-you-ship.18f.gov record from End-of-Term Common Crawl repackage WARC over S3 (offset 18595):" + warcio extract $(EOT_CC_WARC_S3) 18595 + cdx_toolkit: @echo demonstrate that we have this entry in the index cdxt --crawl CC-MAIN-2024-22 --from 20240518015810 --to 20240518015810 iter 
an.wikipedia.org/wiki/Escopete diff --git a/README.md b/README.md index cd7b786..cbe115b 100644 --- a/README.md +++ b/README.md @@ -174,17 +174,26 @@ python ./warcio-iterator.py whirlwind.warc.wat.gz The output has three sections, one each for the WARC, WET, and WAT. For each one, it prints the record types we saw before, plus the `WARC-Target-URI` for those record types that have it. +### Iterating over remote files + warcio also supports working on remote files, so let us try the same command on the remote version of the same WARC file we just iterated locally. We will reach this remote file from the Github repository for this tutorial: `make iterate-remote`
Click to view code + +``` python ./warcio-iterator.py https://raw.githubusercontent.com/commoncrawl/whirlwind-python/refs/heads/main/whirlwind.warc.gz +```
+ The output should be identical to what you saw from the local file: +
Click to view output + +``` WARC-Type: warcinfo WARC-Type: request WARC-Target-URI https://an.wikipedia.org/wiki/Escopete @@ -192,9 +201,10 @@ The output should be identical to what you saw from the local file: WARC-Target-URI https://an.wikipedia.org/wiki/Escopete WARC-Type: metadata WARC-Target-URI https://an.wikipedia.org/wiki/Escopete +```
-We get the same output, but this time by streaming the file over HTTPS instead of reading from local disk. Later in the tour, we will use this capability to index and extract records from remote WARC files hosted on AWS S3 buckets. +We got the same output, but this time by streaming the file over HTTPS instead of reading from local disk. Later in the tour, we will use this capability to index and extract records from remote WARC files hosted on AWS S3 buckets. ## Task 3: Index the WARC, WET, and WAT @@ -217,7 +227,7 @@ The CDX index files are sorted plain-text files, with each line containing infor We can create our own CDXJ index from the local WARCs by running: -```make cdxj``` +`make cdxj` This uses the [cdxj-indexer](https://github.com/webrecorder/cdxj-indexer) library to generate CDXJ index files for our WARC files by running the code below: @@ -239,7 +249,28 @@ For each of these records, there's one text line in the index - yes, it's a flat What is the purpose of this funky format? It's done this way because these flat files (300 gigabytes total per crawl) can be sorted on the primary key using any out-of-core sort utility e.g. the standard Linux `sort`, or one of the Hadoop-based out-of-core sort functions. -The JSON blob has enough information to cleanly isolate the raw data of a single record: it defines which WARC file the record is in, and the byte offset and length of the record within this file. We'll use that in the next section. +The JSON blob has enough information to cleanly isolate the raw data of a single record: it defines which WARC file the record is in, and the byte offset and length of the record within this file. We'll use that in Task 4, when accessing the contents of the WARC at this offset. But first, let's observe that we could do all of this processing over a remote file location, as before. 
+ + +### Indexing remote WARC files + +Through warcio's remote file handling capabilities, `cdxj-indexer` too can work on remote files, and this is true not just Common Crawl's, but any WARC files accessible over HTTPS or S3. As an example, let us check two WARC files from the End-of-Term Web Archive, which preserves U.S. government websites around presidential transitions. We will check one WARC file crawled by the Internet Archive (in the IA-000 segment), and another one repackaged from Common Crawl data (in the CC-000 segment). Let's index a few records from each. + +Run: + +`make cdxj-remote` + +
+ Click to view code + +``` +cdxj-indexer https://eotarchive.s3.amazonaws.com/crawl-data/EOT-2024/segments/IA-000/warc/EOT24PRE-20240926172119-crawl804_EOT24PRE-20240926172119-00000.warc.gz 2>/dev/null | head -n 10 | tee eot-ia.cdxj +cdxj-indexer s3://eotarchive/crawl-data/EOT-2024/segments/CC-000/warc/EOT-2024-REPACKAGE-CC-MAIN-2024-42-GOV-000000-001.warc.gz 2>/dev/null | head -n 10 | tee eot-cc.cdxj +``` +
+ +The first command fetches and indexes a WARC over HTTPS, the second over S3. These real-life WARC files are around 1GB each, so we display and save only the first 10 records. + ## Task 4: Use the CDXJ index to extract a subset of raw content from the local WARC, WET, and WAT @@ -256,7 +287,7 @@ and lengths for every record. Let's extract some individual records from our warc.gz files. Run: -```make extract``` +`make extract` to run a set of extractions from your local `whirlwind.*.gz` files with `warcio` using the code below: @@ -279,6 +310,31 @@ ones as in the index. Look at the three output files: `extraction.html`, `extrac Notice that we extracted HTML from the WARC, text from WET, and JSON from the WAT (as shown in the different file extensions). This is because the payload in each file type is formatted differently! + +### Extracting from remote WARC files + +The same random access trick works on remote files. By indexing deeper into the EOT WARC files from Task 3 (try increasing the head count, or removing it entirely if you're patient), we can find offsets for specific records and extract them directly — without downloading the entire file. + +Run: + +`make extract-remote` + +
+ Click to view code + +``` +warcio extract https://eotarchive.s3.amazonaws.com/crawl-data/EOT-2024/segments/IA-000/warc/EOT24PRE-20240926172119-crawl804_EOT24PRE-20240926172119-00000.warc.gz 50755 +warcio extract s3://eotarchive/crawl-data/EOT-2024/segments/CC-000/warc/EOT-2024-REPACKAGE-CC-MAIN-2024-42-GOV-000000-001.warc.gz 18595 +``` +
+ +The first command extracts the record for `https://hpxml.nrel.gov/` (HPXML Toolbox, hosted by the National Renewable Energy Laboratory) from an Internet Archive crawl, fetched over HTTPS. The second extracts the record for `https://before-you-ship.18f.gov/` (18F's pre-launch checklist for government services) from a Common Crawl repackage, fetched over S3. + +In both cases, warcio uses the byte offset to seek directly to the right position in the remote file and decompress just that one record. Later in this tutorial we will see the same mechanism being used by `cdx_toolkit` to fetch a specific capture, by looking up the offset in the CDX index, then make a byte-range request to retrieve just the record you want. + +**Note:** If you look at the output of the second extraction (`before-you-ship.18f.gov`), you'll notice that despite having an HTTP 200 status in the index, the actual HTML content is just a redirect page pointing to `handbook.tts.gsa.gov`. This is a good reminder that real crawl data is messy, a 200 status in the index doesn't always mean you'll get a full page of content! + + ## Task 5: Wreck the WARC by compressing it wrong As mentioned earlier, WARC/WET/WAT files look like they're normal gzipped files, but they're actually gzipped in a particular way that allows random access. 
From 157eeec5ddcf80f6adf86122fbc47b4e76de4549 Mon Sep 17 00:00:00 2001 From: Hande Celikkanat <7702228+handecelikkanat@users.noreply.github.com> Date: Mon, 13 Apr 2026 12:17:32 +0300 Subject: [PATCH 5/6] feat: add CI tests for make targets: iterate-remote, cdxj-remote, extract-remote --- .github/workflows/ci.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 4bdc673..1932b56 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -57,12 +57,21 @@ jobs: - name: make iterate run: make iterate + - name: make iterate-remote + run: make iterate-remote + - name: make cdxj run: make cdxj + - name: make cdxj-remote + run: make cdxj-remote + - name: make extract run: make extract + - name: make extract-remote + run: make extract-remote + - name: make cdx_toolkit run: make cdx_toolkit From 32ba031b59f49d52269b2c2e0f297c74917a0219 Mon Sep 17 00:00:00 2001 From: Hande Celikkanat <7702228+handecelikkanat@users.noreply.github.com> Date: Mon, 13 Apr 2026 14:06:28 +0300 Subject: [PATCH 6/6] fix: divide remote into remote-https and remote-s3. add ci tests only for https. 
--- .github/workflows/ci.yaml | 8 +++---- Makefile | 22 ++++++++++++++++--- README.md | 45 +++++++++++++++++++++++++++------------ 3 files changed, 54 insertions(+), 21 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 1932b56..d849fa6 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -63,14 +63,14 @@ jobs: - name: make cdxj run: make cdxj - - name: make cdxj-remote - run: make cdxj-remote + - name: make cdxj-remote-https + run: make cdxj-remote-https - name: make extract run: make extract - - name: make extract-remote - run: make extract-remote + - name: make extract-remote-https + run: make extract-remote-https - name: make cdx_toolkit run: make cdx_toolkit diff --git a/Makefile b/Makefile index fd742b7..3995e53 100644 --- a/Makefile +++ b/Makefile @@ -38,10 +38,18 @@ cdxj: cdxj-indexer --records conversion whirlwind.warc.wet.gz > whirlwind.warc.wet.cdxj cdxj-indexer whirlwind.warc.wat.gz > whirlwind.warc.wat.cdxj -cdxj-remote: +cdxj-remote-https: @echo "indexing End-of-Term-2024 Internet Archive WARC over HTTPS (File size ~1GB, showing first 10 records):" cdxj-indexer $(EOT_IA_WARC_HTTPS) 2>/dev/null | head -n 10 | tee eot-ia.cdxj @echo + @echo "indexing End-of-Term-2024 Common Crawl repackage WARC over HTTPS (File size ~1GB, showing first 10 records):" + cdxj-indexer $(EOT_CC_WARC_HTTPS) 2>/dev/null | head -n 10 | tee eot-cc.cdxj + +cdxj-remote-s3: + @echo "!! 
this step requires authentication via S3 credentials (even though it is free)" + @echo "indexing End-of-Term-2024 Internet Archive WARC over S3 (File size ~1GB, showing first 10 records):" + cdxj-indexer $(EOT_IA_WARC_S3) 2>/dev/null | head -n 10 | tee eot-ia.cdxj + @echo @echo "indexing End-of-Term-2024 Common Crawl repackage WARC over S3 (File size ~1GB, showing first 10 records):" cdxj-indexer $(EOT_CC_WARC_S3) 2>/dev/null | head -n 10 | tee eot-cc.cdxj @@ -52,10 +60,18 @@ extract: warcio extract --payload whirlwind.warc.wat.gz 443 > extraction.json @echo "hint: python -m json.tool extraction.json" -extract-remote: +extract-remote-https: @echo "extracting hpxml.nrel.gov record from End-of-Term Internet Archive WARC over HTTPS (offset 50755):" warcio extract $(EOT_IA_WARC_HTTPS) 50755 @echo + @echo "extracting before-you-ship.18f.gov record from End-of-Term Common Crawl repackage WARC over HTTPS (offset 18595):" + warcio extract $(EOT_CC_WARC_HTTPS) 18595 + +extract-remote-s3: + @echo "!! this step requires authentication via S3 credentials (even though it is free)" + @echo "extracting hpxml.nrel.gov record from End-of-Term Internet Archive WARC over S3 (offset 50755):" + warcio extract $(EOT_IA_WARC_S3) 50755 + @echo @echo "extracting before-you-ship.18f.gov record from End-of-Term Common Crawl repackage WARC over S3 (offset 18595):" warcio extract $(EOT_CC_WARC_S3) 18595 @@ -81,7 +97,7 @@ download_collinfo: curl -O https://index.commoncrawl.org/collinfo.json CC-MAIN-2024-22.warc.paths.gz: - @echo "downloading the list from S3 requires S3 auth (even though it is free)" + @echo "!! 
this step requires authentication via S3 credentials (even though it is free)" @echo "note that this file should already be in the repo" aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk '{print $$4}' | gzip -9 > CC-MAIN-2024-22.warc.paths.gz diff --git a/README.md b/README.md index cbe115b..960ffae 100644 --- a/README.md +++ b/README.md @@ -257,19 +257,34 @@ The JSON blob has enough information to cleanly isolate the raw data of a single Through warcio's remote file handling capabilities, `cdxj-indexer` too can work on remote files, and this is true not just Common Crawl's, but any WARC files accessible over HTTPS or S3. As an example, let us check two WARC files from the End-of-Term Web Archive, which preserves U.S. government websites around presidential transitions. We will check one WARC file crawled by the Internet Archive (in the IA-000 segment), and another one repackaged from Common Crawl data (in the CC-000 segment). Let's index a few records from each. Run: - -`make cdxj-remote` +`make cdxj-remote-https`
Click to view code ``` cdxj-indexer https://eotarchive.s3.amazonaws.com/crawl-data/EOT-2024/segments/IA-000/warc/EOT24PRE-20240926172119-crawl804_EOT24PRE-20240926172119-00000.warc.gz 2>/dev/null | head -n 10 | tee eot-ia.cdxj -cdxj-indexer s3://eotarchive/crawl-data/EOT-2024/segments/CC-000/warc/EOT-2024-REPACKAGE-CC-MAIN-2024-42-GOV-000000-001.warc.gz 2>/dev/null | head -n 10 | tee eot-cc.cdxj +cdxj-indexer https://eotarchive.s3.amazonaws.com/crawl-data/EOT-2024/segments/CC-000/warc/EOT-2024-REPACKAGE-CC-MAIN-2024-42-GOV-000000-001.warc.gz 2>/dev/null | head -n 10 | tee eot-cc.cdxj ```
-The first command fetches and indexes a WARC over HTTPS, the second over S3. These real-life WARC files are around 1GB each, so we display and save only the first 10 records. +Both commands fetch and index their WARC files over HTTPS. Since each file is around 1GB, we display and save only the first 10 records. + +If you have AWS credentials configured, you can also access the same files over S3, which is faster when running on AWS. Even though you will need AWS credentials for authentication purposes, this process is still free of charge since these are public buckets. +If you do not have AWS credentials, you can access the same information over HTTPS as described above. + +Run: + +`make cdxj-remote-s3` + +
+ Click to view code + +``` +cdxj-indexer s3://eotarchive/crawl-data/EOT-2024/segments/IA-000/warc/EOT24PRE-20240926172119-crawl804_EOT24PRE-20240926172119-00000.warc.gz 2>/dev/null | head -n 10 | tee eot-ia.cdxj +cdxj-indexer s3://eotarchive/crawl-data/EOT-2024/segments/CC-000/warc/EOT-2024-REPACKAGE-CC-MAIN-2024-42-GOV-000000-001.warc.gz 2>/dev/null | head -n 10 | tee eot-cc.cdxj +``` +
## Task 4: Use the CDXJ index to extract a subset of raw content from the local WARC, WET, and WAT @@ -316,24 +331,26 @@ Notice that we extracted HTML from the WARC, text from WET, and JSON from the WA The same random access trick works on remote files. By indexing deeper into the EOT WARC files from Task 3 (try increasing the head count, or removing it entirely if you're patient), we can find offsets for specific records and extract them directly — without downloading the entire file. Run: - -`make extract-remote` +`make extract-remote-https`
Click to view code ``` warcio extract https://eotarchive.s3.amazonaws.com/crawl-data/EOT-2024/segments/IA-000/warc/EOT24PRE-20240926172119-crawl804_EOT24PRE-20240926172119-00000.warc.gz 50755 -warcio extract s3://eotarchive/crawl-data/EOT-2024/segments/CC-000/warc/EOT-2024-REPACKAGE-CC-MAIN-2024-42-GOV-000000-001.warc.gz 18595 +warcio extract https://eotarchive.s3.amazonaws.com/crawl-data/EOT-2024/segments/CC-000/warc/EOT-2024-REPACKAGE-CC-MAIN-2024-42-GOV-000000-001.warc.gz 18595 ```
-The first command extracts the record for `https://hpxml.nrel.gov/` (HPXML Toolbox, hosted by the National Renewable Energy Laboratory) from an Internet Archive crawl, fetched over HTTPS. The second extracts the record for `https://before-you-ship.18f.gov/` (18F's pre-launch checklist for government services) from a Common Crawl repackage, fetched over S3. +The first command extracts the record for https://hpxml.nrel.gov/ (HPXML Toolbox, hosted by the National Renewable Energy Laboratory) from an Internet Archive crawl. The second extracts the record for https://before-you-ship.18f.gov/ (18F's pre-launch checklist for government services) from a Common Crawl repackage. + +As with indexing, you can also use S3 paths if you have AWS credentials configured: -In both cases, warcio uses the byte offset to seek directly to the right position in the remote file and decompress just that one record. Later in this tutorial we will see the same mechanism being used by `cdx_toolkit` to fetch a specific capture, by looking up the offset in the CDX index, then make a byte-range request to retrieve just the record you want. +`make extract-remote-s3` -**Note:** If you look at the output of the second extraction (`before-you-ship.18f.gov`), you'll notice that despite having an HTTP 200 status in the index, the actual HTML content is just a redirect page pointing to `handbook.tts.gsa.gov`. This is a good reminder that real crawl data is messy, a 200 status in the index doesn't always mean you'll get a full page of content! +In both cases, warcio uses the byte offset to seek directly to the right position in the remote file and decompress just that one record. Later in this tutorial we will see the same mechanism being used by `cdx_toolkit` to fetch a specific capture, by looking up the offset in the CDX index, then making a byte-range request to retrieve just the record you want. 
+**Note:** If you look at the output of the second extraction (before-you-ship.18f.gov), you'll notice that despite having an HTTP 200 status in the index, the actual HTML content is just a redirect page pointing to handbook.tts.gsa.gov. This is a good reminder that real crawl data is messy — a 200 status in the index doesn't always mean you'll get a full page of content! ## Task 5: Wreck the WARC by compressing it wrong @@ -472,7 +489,7 @@ We check for capture results using the `cdxt` command `iter`, specifying the exa #### Retrieve the fetched content as WARC Next, we use the `cdxt` command `warc` to retrieve the content and save it locally as a new WARC file, again specifying the exact URL, crawl identifier, and timestamp range. This creates the WARC file `TEST-000000.extracted.warc.gz` which contains a `warcinfo` record explaining what the WARC is, followed by the `response` record we requested. -* If you dig into cdx_toolkit's code, you'll find that it is using the offset and length of the WARC record (as returned by the CDX index query) to make a HTTP byte range request to S3 that isolates and returns just the single record we want from the full file. It only downloads the response WARC record because our CDX index only has the response records indexed. +* If you dig into `cdx_toolkit`'s code, you'll find that it is using the offset and length of the WARC record (as returned by the CDX index query) to make a HTTP byte range request to S3 that isolates and returns just the single record we want from the full file. It only downloads the response WARC record because our CDX index only has the response records indexed. * By default `cdxt` avoids overwriting existing files by automatically incrementing the counter in the filename. If you run this again without deleting `TEST-000000.extracted.warc.gz`, the data will be written again to a new file `TEST-000001.extracted.warc.gz`. * Limit, timestamp, and crawl index args, as well as URL wildcards, work as for `iter`. 
@@ -486,7 +503,7 @@ Now let's look at the columnar index, the other kind of index that Common Crawl We could read the data directly from our index in our S3 bucket and analyse it in the cloud through AWS Athena. However, this is a managed service that costs money to use (though usually a small amount). [You can read about using it here.](https://commoncrawl.org/blog/index-to-warc-files-and-urls-in-columnar-format) This whirlwind tour will only use the free method of either fetching data from outside of AWS (which is kind of slow), or making a local copy of a single columnar index (300 gigabytes per monthly crawl), and then using that. -The columnar index is divided up into a separate index per crawl, which Athena or duckdb can stitch together. The cdx index is similarly divided up, but cdx_toolkit hides that detail from you. +The columnar index is divided up into a separate index per crawl, which Athena or duckdb can stitch together. The cdx index is similarly divided up, but `cdx_toolkit` hides that detail from you. For the purposes of this whirlwind tour, we don't want to configure all the crawl indices because it would be slow. So let's start by figuring out which crawl was ongoing on the date 20240518015810, and then we'll work with just that one crawl. @@ -640,8 +657,8 @@ All of these scripts run the same SQL query and should return the same record (w 1. Use the DuckDb techniques from [Task 8](#task-8-query-using-the-columnar-index--duckdb-from-outside-aws) and the [Index Server](https://index.commoncrawl.org) to find a new webpage in the archives. 2. Note its url, warc, and timestamp. -3. Now open up the Makefile from [Task 6](#task-6-use-cdx_toolkit-to-query-the-full-cdx-index-and-download-those-captures-from-aws-s3) and look at the actions from the cdx_toolkit section. -4. Repeat the cdx_toolkit steps, but for the page and date range you found above. +3. 
Now open up the Makefile from [Task 6](#task-6-use-cdx_toolkit-to-query-the-full-cdx-index-and-download-those-captures-from-aws-s3) and look at the actions from the `cdx_toolkit` section. +4. Repeat the `cdx_toolkit` steps, but for the page and date range you found above. ## Congratulations!