Enable cryptokitties data from nonfungible.com (#1680)

* enable cryptokitties

    Recall that nonfungible.com sent us an initial data dump so that we
    wouldn't have to pull everything from the API. And recall that we're
    storing that initial dump on S3. Finally, recall that the cryptokitties
    data was particularly voluminous (~1GB), which doesn't easily transfer to
    and from S3.

    These changes provide a CLI script to partition that data (or any such data
    from nonfungible.com) into a set of smaller files, and a scraping script
    that seamlessly recombines those partitions for loading into the database.
    (A minimal sketch of that recombination step follows after this list.)

* add chainbreakers, chibifighters & mlbcryptobaseball
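
The recombination side mentioned in the first bullet is not part of the diff shown below. What follows is a minimal sketch of that step, not the actual scraping script from this commit: it assumes only the `sales_summary_${publisher}${N}.json` naming convention produced by the partitioning script, and it reads the chunk files from local disk, whereas the real scraping script pulls them from S3.

    // Minimal sketch: recombine partitioned nonfungible.com dump files.
    // Hypothetical helper; not the scraping script shipped in this commit.
    import { existsSync, readFileSync } from 'fs';

    function readRecombinedDump(publisher: string): object[] {
        let trades: object[] = [];
        // Read chunk files in order until one is missing, mirroring the
        // zero-based numbering produced by the partitioning script below.
        for (let chunkIndex = 0; ; chunkIndex++) {
            const chunkFilename = `sales_summary_${publisher}${chunkIndex}.json`;
            if (!existsSync(chunkFilename)) {
                break;
            }
            const chunk: object[] = JSON.parse(readFileSync(chunkFilename).toString());
            trades = trades.concat(chunk);
        }
        return trades;
    }

    // Example usage: const allTrades = readRecombinedDump('cryptokitties');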
Author: F. Eugene Aumson
Date: 2019-03-15 10:50:59 -04:00
Committed by: GitHub
Parent: 7f5a3f12ca
Commit: 4bf311a282
2 changed files with 183 additions and 65 deletions

New file: partition_nonfungible_dot_com_dump.ts

@@ -0,0 +1,48 @@
/**
 * Needed because we store the initial dump of trades in S3, and some projects
 * (namely cryptokitties) have dumps that are too big to be transferred easily
 * as one big file to and from S3. This script breaks apart a dump file into a
 * set of files containing segments of the data. The number of segments is
 * based on the S3_CHUNK_SIZES specified for each project, or "publisher" in
 * their parlance, in ../../data_sources/nonfungible_dot_com/index.ts.
 *
 * Usage: $ node partition_nonfungible_dot_com_dump.ts publisher
 * Example: $ node partition_nonfungible_dot_com_dump.ts cryptokitties
 *
 * Expects to find on disk a data file named
 * `sales_summary_${publisher}.json`, as emailed by Daniel of nonfungible.com.
 *
 * Writes to disk a set of files named `sales_summary_${publisher}${N}.json`.
 *
 * You will probably need to run `node` with `--max-old-space-size=1024`, or
 * an even larger value.
 */
import { readFileSync, writeFileSync } from 'fs';
import { splitEvery } from 'ramda';
import { logUtils } from '@0x/utils';
import {
    NonfungibleDotComHistoryResponse,
    NonfungibleDotComTradeResponse,
    S3_CHUNK_SIZES,
} from '../data_sources/nonfungible_dot_com';

(() => {
    const publisher = process.argv[2];
    const inputFilename = `sales_summary_${publisher}.json`;

    logUtils.log(`Reading input file ${inputFilename}`);
    const sourceJson: NonfungibleDotComHistoryResponse = JSON.parse(readFileSync(inputFilename).toString());

    const chunkSize = S3_CHUNK_SIZES[publisher];
    logUtils.log(`Splitting data into chunks of ${chunkSize} trades each`);
    const chunks: NonfungibleDotComTradeResponse[][] = splitEvery(chunkSize, sourceJson.data);

    logUtils.log(`Writing ${chunks.length} chunks to disk`);
    for (let chunkIndex = 0; chunkIndex < chunks.length; chunkIndex++) {
        writeFileSync(`sales_summary_${publisher}${chunkIndex}.json`, JSON.stringify(chunks[chunkIndex]));
    }
})();
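
The script above imports S3_CHUNK_SIZES from ../data_sources/nonfungible_dot_com, which is not part of this excerpt. Purely as a hypothetical sketch of its shape: the publisher keys below are the ones touched by this commit, and the numbers are illustrative, not the real configuration.

    // Hypothetical sketch only; the real map lives in
    // ../data_sources/nonfungible_dot_com/index.ts with different values.
    export const S3_CHUNK_SIZES: { [publisher: string]: number } = {
        cryptokitties: 200000, // the ~1GB dump presumably needs the most aggressive partitioning
        chainbreakers: 50000,
        chibifighters: 50000,
        mlbcryptobaseball: 50000,
    };

Keying the chunk size by publisher lets each dump be partitioned according to its own size, so only the very large ones pay the cost of being spread across many S3 objects.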