-
Notifications
You must be signed in to change notification settings - Fork 48
/
Copy pathdl.py
executable file
·51 lines (44 loc) · 1.48 KB
/
dl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/python
import argparse
import csv
import os
from pathlib import Path
import pandas as pd
import requests
# CSV listing the filings; rows are read by the column names
# "form", "cik", "date" and "url".
INDEX_PATH = "full_index.csv"
# Prefix joined with each row's "url" value to build the download URL.
ARCHIVES_URL = "https://www.sec.gov/Archives/"
# NOTE(review): the server may reject generic User-Agent values; with the
# status check below such rejections are reported as failed downloads
# instead of being saved as filings -- confirm this header is acceptable.
USER_AGENT = {"User-agent": "Mozilla/5.0"}
# Seconds passed to requests as the per-request timeout.
REQUEST_TIMEOUT = 60


def select_rows(index_path, filing):
    """Return the rows of the index CSV whose "form" contains *filing*.

    Matching is a plain substring test, so "10-K" also selects "10-K/A".

    Args:
        index_path: Path of the CSV file with a header row.
        filing: Text to look for in each row's "form" column.

    Returns:
        A list of row dicts in file order (possibly empty).

    Raises:
        OSError: If the index file cannot be opened.
        KeyError: If the CSV has no "form" column.
    """
    # newline="" lets the csv module handle line endings (including
    # newlines embedded in quoted fields) itself.
    with open(index_path, "r", newline="") as f:
        return [row for row in csv.DictReader(f) if filing in row["form"]]


def filing_path(folder, row):
    """Build the destination path for one index row.

    The layout is ``./<folder>/<year>_<month>/<cik>_<date>_<accession>.txt``
    where year and month are the first two "-"-separated pieces of the
    row's "date" and accession is the last "-"-separated piece of the
    row's "url" before its first ".".

    Args:
        folder: Output directory given on the command line.
        row: Mapping with "cik", "date" and "url" string values.

    Returns:
        A ``pathlib.Path`` for the file the row should be saved to.

    Raises:
        KeyError: If a required column is missing.
        IndexError: If "date" contains no "-".
    """
    cik = row["cik"].strip()
    date = row["date"].strip()
    date_parts = row["date"].split("-")
    year = date_parts[0].strip()
    month = date_parts[1].strip()
    accession = row["url"].strip().split(".")[0].split("-")[-1]
    return Path(f"./{folder}/{year}_{month}/{cik}_{date}_{accession}.txt")


def save_filing(file_path, text):
    """Write *text* to *file_path* without ever leaving a partial file.

    The text is written to a sibling ``<name>.part`` file and then moved
    into place, because the download loop skips any path that already
    exists and would otherwise never repair a truncated file.

    Args:
        file_path: Destination path; its parent directory must exist.
        text: Content to store. Written as UTF-8 so the result does not
            depend on the platform's default encoding.

    Raises:
        OSError: If the file cannot be written or moved into place.
    """
    file_path = Path(file_path)
    tmp_path = file_path.with_name(file_path.name + ".part")
    try:
        with open(tmp_path, "w", encoding="utf-8", errors="ignore") as f:
            f.write(text)
        os.replace(tmp_path, file_path)
    finally:
        # Only still present when the write or the move failed.
        if tmp_path.exists():
            tmp_path.unlink()


def main():
    """Download every filing of the requested form type listed in the index.

    Command line: ``dl.py <filing> <folder>``. Rows already present on disk
    are skipped; a failed download is reported and the loop continues.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("filing", type=str)
    parser.add_argument("folder", type=str)
    args = parser.parse_args()

    to_dl = select_rows(INDEX_PATH, args.filing)
    total = len(to_dl)
    print(total)
    print("start to download")
    for n, row in enumerate(to_dl):
        print(f"{n} out of {total}")
        cik = row["cik"].strip()
        date = row["date"].strip()
        file_path = filing_path(args.folder, row)
        file_path.parent.mkdir(parents=True, exist_ok=True)
        if file_path.exists():
            continue
        try:
            response = requests.get(
                ARCHIVES_URL + row["url"].strip(),
                headers=USER_AGENT,
                timeout=REQUEST_TIMEOUT,
            )
            # Treat 4xx/5xx answers as failures so an error page is never
            # stored as a filing (and then skipped on every later run).
            response.raise_for_status()
            save_filing(file_path, response.text)
        except (requests.RequestException, OSError):
            # Best effort: report and move on. Ctrl-C and programming
            # errors are deliberately not caught here.
            print(f"{cik}, {date} failed to download")


if __name__ == "__main__":
    main()