Skip to content

Commit

Permalink
Integrate building database
Browse files Browse the repository at this point in the history
  • Loading branch information
mgao6767 committed May 1, 2023
1 parent 981b712 commit b2acc65
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 7 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ sample/

# Database
*.db
*.sqlite3

# VS Code, etc.
.vscode/
Expand Down
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,22 @@ pip install edgar-analyzer

### Setup

Download index files
**Download index files**, which contain the firm CIK, name, filing date, type, and URL of the filing.

```bash
edgar-analyzer download_index --user_agent "MyCompany name@mycompany.com" --output "./index"
```

Download filings (to be integrated)
**Build a database** of the previously downloaded index files for more efficient queries.

```bash
edgar_analyzer download_filings
edgar-analyzer build_database --inputdir "./index" --database "edgar-idx.sqlite3"
```

Build database (to be integrated)
Download filings (to be integrated)

```bash
edgar_analyzer build_database
edgar-analyzer download_filings
```

### Run specific jobs
Expand Down
2 changes: 1 addition & 1 deletion edgaranalyzer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import types
import sys

# NOTE(review): this first assignment is immediately overridden by the next
# line, so the effective package version is "0.0.1rc3".
__version__ = "0.0.1rc2"
__version__ = "0.0.1rc3"
# Package metadata: short description and author contact details.
__description__ = "Textual analysis on SEC filings from EDGAR"
__author__ = "Mingze Gao"
__author_email__ = "mingze.gao@sydney.edu.au"
Expand Down
47 changes: 46 additions & 1 deletion edgaranalyzer/cmd_build_database.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,50 @@
import argparse
import os
import pathlib
import sqlite3

# Base URL of the SEC EDGAR archives. `parse` prefixes it to the relative
# filing path found in each index line to form the stored `url` value.
EDGAR_BASE = "https://www.sec.gov/Archives/"


def cmd(args: argparse.Namespace) -> None:
    """Populate the SQLite index database from downloaded index files.

    Walks ``args.inputdir`` recursively, converts each usable line of every
    file found into a row with ``parse``, and inserts the rows into the
    ``edgar_idx`` table of the database at ``args.database``. The table is
    created if it does not exist yet, and missing parent directories of the
    database file are created as well. All inserts are committed once at the
    end; if anything fails, the connection is closed without committing.

    Args:
        args: Parsed command-line arguments. ``args.inputdir`` is the
            directory holding the index files and ``args.database`` is the
            path of the SQLite file to write.

    Raises:
        ValueError: If ``args.inputdir`` was not supplied.
        FileNotFoundError: If ``args.inputdir`` is not an existing directory.
    """
    # The option may be left unset by the parser, in which case pathlib would
    # fail with an unhelpful TypeError.
    if args.inputdir is None:
        raise ValueError("an input directory of index files is required")

    inputdir = pathlib.Path(args.inputdir).resolve().as_posix()
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not os.path.isdir(inputdir):
        raise FileNotFoundError(f"input directory does not exist: {inputdir}")

    dbpath = pathlib.Path(args.database).resolve().as_posix()
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(os.path.dirname(dbpath), exist_ok=True)

    # NOTE(review): the table declares no UNIQUE / PRIMARY KEY constraint, so
    # `INSERT OR IGNORE` has nothing to conflict with and re-running the
    # command on the same index files inserts the rows again.
    insert_sql = (
        "INSERT OR IGNORE INTO edgar_idx "
        "(cik, firm_name, file_type, date, url) VALUES (?,?,?,?,?)"
    )

    conn = sqlite3.connect(dbpath)
    try:
        c = conn.cursor()
        c.execute(
            """CREATE TABLE IF NOT EXISTS edgar_idx
            (cik TEXT, firm_name TEXT, file_type TEXT, date DATE, url TEXT);"""
        )

        for dirpath, _, filenames in os.walk(inputdir):
            for filename in filenames:
                filepath = os.path.join(dirpath, filename)
                print(f"Populating database using {filepath}")
                with open(filepath, "r") as f:
                    # Stream rows straight from the file instead of loading
                    # it whole. Lines with fewer than five "|"-separated
                    # fields (e.g. blank lines) are skipped because they
                    # cannot fill the five placeholders and would abort the
                    # whole import.
                    rows = (parse(line) for line in f if line.count("|") >= 4)
                    c.executemany(insert_sql, rows)

        conn.commit()
    finally:
        # Always release the database handle, even when an insert fails.
        conn.close()


def parse(line):
    """Convert one index line into a row for the ``edgar_idx`` table.

    Each line has the form ``cik|firm_name|file_type|date|url_txt|url_html``,
    for example (shown wrapped here; it is a single line in the file)::

        99780|TRINITY INDUSTRIES INC|8-K|2020-01-15|edgar/data/99780/0000099780-
        20-000008.txt|edgar/data/99780/0000099780-20-000008-index.html

    Only the first five fields are kept, and the relative ``url_txt`` path is
    made absolute by prefixing ``EDGAR_BASE``.

    Args:
        line: A single ``|``-delimited index line. A trailing line terminator
            is ignored.

    Returns:
        A 5-tuple ``(cik, firm_name, file_type, date, url)`` of strings.

    Raises:
        ValueError: If the line has fewer than five ``|``-separated fields.
    """
    # Drop the line terminator so it cannot leak into the last kept field
    # when the line carries exactly five fields.
    fields = line.rstrip("\r\n").split("|")
    if len(fields) < 5:
        raise ValueError(
            f"malformed index line (expected at least 5 fields): {line!r}"
        )
    cik, firm_name, file_type, date, url_txt = fields[:5]
    return (cik, firm_name, file_type, date, EDGAR_BASE + url_txt)
15 changes: 15 additions & 0 deletions edgaranalyzer/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,21 @@ def init_argparse() -> argparse.ArgumentParser:
help="Sample loan contracts from filings",
)

parser_build_db.add_argument(
"-db",
"--database",
metavar="database",
default="edgar-idx.sqlite3",
help="output sqlite database to store results. Defaults to `edgar-idx.sqlite3`",
)
required = parser_build_db.add_argument_group("required named arguments")
required.add_argument(
"-i",
"--inputdir",
metavar="inputdir",
help="input directory of index files from `download_index`",
)

# subparser for `download_index` subcommand
required = parser_download.add_argument_group("required named arguments")
required.add_argument(
Expand Down

0 comments on commit b2acc65

Please sign in to comment.