Skip to content

Commit

Permalink
Swap IO to do blocks, fix downloads when no filings are present
Browse files Browse the repository at this point in the history
  • Loading branch information
tylerjthomas9 committed Dec 17, 2024
1 parent 6a3e5fa commit 31dafe5
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 20 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "ScrapeSEC"
uuid = "856806e7-be2f-4540-8165-3a51303b7af0"
authors = ["tylerjthomas9 <[email protected]>"]
version = "1.0.1"
version = "1.0.2"

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
Expand Down
21 changes: 15 additions & 6 deletions src/download_filings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ function download_filing(
mkdir(company_folder)
end

f = open(new_file, "w")
write(f, clean_text(text))
close(f)
open(new_file, "w") do f
write(f, clean_text(text))
end

return nothing
end
Expand Down Expand Up @@ -92,13 +92,18 @@ function download_filings(
# throttle downloads to `download_rate` requests per second
sleep_time = 1 / download_rate

if skip_file
filenames = filter(file -> !isfile(joinpath(dest, replace(file, "edgar/data/" => ""))), filenames)
end

if isempty(filenames)
return nothing
end

job = addjob!(pbar; N=size(filenames, 1), description=pbar_desc)
start!(pbar)
for file in filenames
full_file = joinpath(dest, replace(file, "edgar/data/" => ""))
if isfile(full_file) && skip_file
continue
end

@async download_filing(file, full_file, dest; clean_text)

Expand Down Expand Up @@ -169,6 +174,10 @@ function download_filings(
end

df = DataFrame(CSV.File(metadata_file; delim="|"))
if isempty(df)
@warn "No filings found in metadata file: $metadata_file"
return nothing
end
df = df[(filing_types).(df[!, "Form Type"]), :]

download_filings(
Expand Down
26 changes: 13 additions & 13 deletions src/download_metadata.jl
Original file line number Diff line number Diff line change
Expand Up @@ -62,27 +62,27 @@ function download_metadata(

HTTP.download(url, temp_zip; update_period=Inf)
zarchive = ZipFile.Reader(temp_zip)
for f in zarchive.files
@assert f.name == "master.idx"
out = open(temp_file, "w")
write(out, read(f, String))
close(out)
for zip_file in zarchive.files
@assert zip_file.name == "master.idx"
open(temp_file, "w") do f
write(f, read(zip_file, String))
end
end
close(zarchive)
rm(temp_zip)

f = open(temp_file, "r")
metadata = readlines(f)[10:end] # skip fluff at top
close(f)
metadata = open(temp_file, "r") do f
readlines(f)[10:end] # skip fluff at top
end
rm(temp_file)

f = open(full_file, "w")
for line in metadata
if occursin("|", line) # skip "----------" line
write(f, line * "\n")
open(full_file, "w") do f
for line in metadata
if occursin("|", line) # skip "----------" line
write(f, line * "\n")
end
end
end
close(f)

return nothing
end
Expand Down
5 changes: 5 additions & 0 deletions test/download_filings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ end
running_tests=true,
)
@test isfile("./metadata/1994-QTR4.tsv")
rm("./metadata/1994-QTR4.tsv")

# Test when metadata files are empty and no filings are downloaded
download_filings(1994, 1994; filing_types=["40-F"])

rm("./metadata/1994-QTR4.tsv")
# TODO: Is it safe to clear the temp dir? I don't want to accidentally delete user files
end

0 comments on commit 31dafe5

Please sign in to comment.