Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
354 changes: 223 additions & 131 deletions README.md

Large diffs are not rendered by default.

4,515 changes: 4,372 additions & 143 deletions dist/warcio.js

Large diffs are not rendered by default.

11 changes: 10 additions & 1 deletion dist/warcio.min.js

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion index.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
require = require("esm")(module);
module.exports = require("./main.js");
module.exports = {
...require("./main.js")
};

// ensure global Headers object is set for node
/* istanbul ignore next */
Expand Down
2 changes: 2 additions & 0 deletions main.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ export { StatusAndHeadersParser, StatusAndHeaders } from './src/statusandheaders

export { WARCParser } from './src/warcparser';

export { WARCSerializer } from './src/warcserializer';

export { WARCRecord } from './src/warcrecord';

export { Indexer, CDXIndexer } from './src/indexer';
5 changes: 4 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "warcio",
"version": "1.1.3",
"version": "1.2.0",
"main": "index.js",
"module": "main.js",
"license": "Apache-2.0",
Expand Down Expand Up @@ -31,9 +31,12 @@
"cli.js"
],
"dependencies": {
"@peculiar/webcrypto": "^1.1.1",
"esm": "^3.2.25",
"hi-base32": "^0.5.0",
"node-fetch": "^2.6.0",
"pako": "^1.0.11",
"uuid-random": "^1.3.0",
"yargs": "^15.3.1"
},
"scripts": {
Expand Down
2 changes: 1 addition & 1 deletion rollup.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,6 @@ export default {
format: 'esm',
}
],
plugins: [commonjs(), resolve()]
plugins: [commonjs(), resolve({ browser: true })]
};

19 changes: 11 additions & 8 deletions src/indexer.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,12 @@ class BaseIndexer
this.out = out;
}

serialize(result) {
return JSON.stringify(result) + "\n";
}

// Serialize one index entry and push it to the output stream.
// NOTE(review): rendered diff hunk — both the pre-PR line (inline
// JSON.stringify) and its post-PR replacement (delegating to serialize(),
// which subclasses override) appear below; only one exists in each
// version of the file — confirm against the PR's +/- markers.
write(result) {
this.out.write(JSON.stringify(result) + "\n");
this.out.write(this.serialize(result));
}

async run(files) {
Expand All @@ -39,7 +43,6 @@ class BaseIndexer
}
}
}

}

indexRecord(record, parser, filename) {
Expand Down Expand Up @@ -132,11 +135,11 @@ class CDXIndexer extends Indexer

switch (opts.format) {
case "cdxj":
this.write = this.writeCDXJ;
this.serialize = this.serializeCDXJ;
break;

case "cdx":
this.write = this.writeCDX11;
this.serialize = this.serializeCDX11;
break;

case "json":
Expand All @@ -159,22 +162,22 @@ class CDXIndexer extends Indexer
return true;
}

// NOTE(review): rendered diff hunk — `writeCDXJ` appears to be the pre-PR
// name and `serializeCDXJ` its post-PR rename; the method now returns the
// CDXJ line instead of writing it to `this.out` directly — confirm against
// the PR's +/- markers.
writeCDXJ(result) {
serializeCDXJ(result) {
// Pull the two positional CDXJ fields out of the entry; the rest of the
// entry becomes the trailing JSON payload. Mutates `result` in place.
const { urlkey, timestamp } = result;
delete result.urlkey;
delete result.timestamp;

// Pre-PR: wrote the line straight to the output stream.
this.out.write(`${urlkey} ${timestamp} ${JSON.stringify(result)}\n`);
// Post-PR: returns the line; the shared write() handles output.
return `${urlkey} ${timestamp} ${JSON.stringify(result)}\n`;
}

// NOTE(review): rendered diff hunk — `writeCDX11` appears to be the pre-PR
// name and `serializeCDX11` its post-PR rename; the method now returns the
// CDX-11 line instead of writing it — confirm against the PR's +/- markers.
writeCDX11(result) {
serializeCDX11(result) {
const value = [];

// Emit each legacy CDX column in order, substituting "-" for any field the
// entry lacks (loose `!=` also treats null as missing).
for (const field of DEFAULT_LEGACY_CDX_FIELDS) {
value.push(result[field] != undefined ? result[field] : "-");
}

// Pre-PR: wrote the space-joined line straight to the output stream.
this.out.write(value.join(" ") + "\n");
// Post-PR: returns the line; the shared write() handles output.
return value.join(" ") + "\n";
}

getField(field, record) {
Expand Down
40 changes: 25 additions & 15 deletions src/readers.js
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,20 @@ class BaseAsyncIterReader
return [chunk.slice(0, inx), chunk.slice(inx)];
}

static async readFully(iter) {
const chunks = [];
let size = 0;

for await (const chunk of iter) {
chunks.push(chunk);
size += chunk.byteLength;
}

return BaseAsyncIterReader.concatChunks(chunks, size);
}



getReadableStream() {
const streamIter = this[Symbol.asyncIterator]();

Expand All @@ -62,16 +76,8 @@ class BaseAsyncIterReader
});
}

// NOTE(review): rendered diff hunk — the inline async implementation below
// appears to be the pre-PR version, replaced post-PR by a thin delegation to
// the static BaseAsyncIterReader.readFully(this); the closing brace is shared
// between the two — confirm against the PR's +/- markers.
async readFully() {
const chunks = [];
let size = 0;

for await (const chunk of this) {
chunks.push(chunk);
size += chunk.byteLength;
}

return BaseAsyncIterReader.concatChunks(chunks, size);
// Post-PR replacement: reuse the hoisted static helper on `this`.
readFully() {
return BaseAsyncIterReader.readFully(this);
}

async readline(maxLength = 0) {
Expand Down Expand Up @@ -110,10 +116,10 @@ class AsyncIterReader extends BaseAsyncIterReader {
}
}

this._sourceIter = streamOrIter[Symbol.asyncIterator]();

if (dechunk) {
this._sourceIter = this.dechunk(this._sourceIter);
this._sourceIter = this.dechunk(streamOrIter);
} else {
this._sourceIter = streamOrIter[Symbol.asyncIterator]();
}

this.lastValue = null;
Expand All @@ -134,7 +140,7 @@ class AsyncIterReader extends BaseAsyncIterReader {
}

async* dechunk(source) {
const reader = new AsyncIterReader(source, null);
const reader = (source instanceof AsyncIterReader) ? source : new AsyncIterReader(source, null);

let size = -1;
let first = true;
Expand Down Expand Up @@ -180,8 +186,12 @@ class AsyncIterReader extends BaseAsyncIterReader {
break;

} else {
yield chunk;
first = false;
if (!chunk || size === 0) {
return;
} else {
yield chunk;
}
}
}

Expand Down
12 changes: 11 additions & 1 deletion src/statusandheaders.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
// Raw CR-LF byte sequences ("\r\n" and "\r\n\r\n") used when serializing
// status lines and header blocks.
const CRLF = new Uint8Array([0x0d, 0x0a]);
const CRLFCRLF = new Uint8Array([0x0d, 0x0a, 0x0d, 0x0a]);


// ===========================================================================
Expand All @@ -17,6 +19,14 @@ class StatusAndHeaders {
return buff.join('\r\n') + '\r\n';
}

async* iterSerialize(encoder) {
yield encoder.encode(this.statusline);
yield CRLF;
for (const [name, value] of this.headers) {
yield encoder.encode(`${name}: ${value}\r\n`);
}
}

_parseResponseStatusLine() {
const parts = splitRemainder(this.statusline, " ", 2);
this._protocol = parts[0];
Expand Down Expand Up @@ -135,4 +145,4 @@ function splitRemainder(str, sep, limit) {


// ===========================================================================
export { StatusAndHeaders, StatusAndHeadersParser };
export { StatusAndHeaders, StatusAndHeadersParser, CRLF, CRLFCRLF };
23 changes: 16 additions & 7 deletions src/warcparser.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { StatusAndHeadersParser } from './statusandheaders';
import { StatusAndHeaders, StatusAndHeadersParser } from './statusandheaders';
import { WARCRecord } from './warcrecord';
import { AsyncIterReader } from './readers';
import { AsyncIterReader, LimitReader } from './readers';


// ===========================================================================
Expand All @@ -11,7 +11,7 @@ class WARCParser
}

static iterRecords(source, options) {
return new WARCParser(source, options);
return new WARCParser(source, options)[Symbol.asyncIterator]();
}

constructor(source, {keepHeadersCase = false, parseHttp = true} = {}) {
Expand Down Expand Up @@ -40,6 +40,10 @@ class WARCParser
}
}

_initRecordReader(warcHeaders) {
return new LimitReader(this._reader, Number(warcHeaders.headers.get("Content-Length") || 0));
}

async parse() {
await this.readToNextRecord();

Expand All @@ -55,7 +59,7 @@ class WARCParser

this._warcHeadersLength = this._reader.getReadOffset();

const record = new WARCRecord({warcHeaders, reader: this._reader});
const record = new WARCRecord({warcHeaders, reader: this._initRecordReader(warcHeaders)});

this._atRecordBoundary = false;
this._record = record;
Expand Down Expand Up @@ -96,9 +100,14 @@ class WARCParser
this._record = null;
}

async _addHttpHeaders(record, headersParser, reader) {
const httpHeaders = await headersParser.parse(reader, {headersClass: this._headersClass});
record._addHttpHeaders(httpHeaders, reader.getReadOffset() - this._warcHeadersLength);
async _addHttpHeaders(record, headersParser) {
const httpHeaders = await headersParser.parse(this._reader, {headersClass: this._headersClass});
record.httpHeaders = httpHeaders;

const headersLen = this._reader.getReadOffset() - this._warcHeadersLength;
if (record.reader.setLimitSkip) {
record.reader.setLimitSkip(record.warcContentLength - headersLen);
}
}
}

Expand Down
Loading