Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhancement: Major code improvements and optimizations #2

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,12 @@ asyncio>=3.4.3

# Utilities
tqdm>=4.66.1
PyYAML>=6.0.1
requests>=2.26.0
backoff>=2.1.2
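# logging and typing ship with the Python standard library and should not be
# installed from PyPI (the PyPI projects of the same name are obsolete
# backports that can shadow the stdlib modules or fail to resolve):
# logging>=0.5.1.2
# typing>=3.7.4.3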
aiohttp>=3.8.0
cachetools>=5.0.0
ratelimit>=2.2.1
51 changes: 45 additions & 6 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

# Create directories if they don't exist
for directory in [
    INPUT_DIR,
    RAW_DIR,
    LOGS_DIR,
    SECTIONS_DIR,
    REPORTS_DIR,
    REPORTS_INDIVIDUAL_DIR,
    REPORTS_CROSS_CASE_DIR,
    REPORTS_EXECUTIVE_DIR,
]:
    # parents=True also creates missing ancestors; exist_ok=True makes the
    # call a no-op when the directory is already there.
    directory.mkdir(parents=True, exist_ok=True)

# Claude settings
Expand All @@ -35,10 +35,49 @@
# NOTE(review): presumably a duration in seconds — confirm against the code
# that consumes this value.
REQUEST_TIMEOUT = 30
# NOTE(review): presumably the pause between retries, in seconds — confirm
# against the caller.
RETRY_DELAY = 1

# Logging format: timestamp, level name and message, separated by " - ".
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'

# FireCrawl API settings
# The key is read from the environment when this module is imported; a
# missing or empty value aborts the import with ValueError.
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
if not FIRECRAWL_API_KEY:
    raise ValueError("FIRECRAWL_API_KEY environment variable is not set")

# Logging configuration (dictionary schema, version 1).
LOGGING_CONFIG = {
    "version": 1,
    # Leave loggers that were created before this config is applied enabled.
    "disable_existing_loggers": False,
    "formatters": {
        "standard": {
            "format": "%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        },
    },
    "handlers": {
        # Console output, written to standard output.
        "default": {
            "level": "INFO",
            "formatter": "standard",
            "class": "logging.StreamHandler",
            "stream": "ext://sys.stdout",
        },
        # File output, appended to crawler.log inside LOGS_DIR.
        "file": {
            "level": "INFO",
            "formatter": "standard",
            "class": "logging.FileHandler",
            "filename": LOGS_DIR / "crawler.log",
            "mode": "a",
        },
    },
    "loggers": {
        # The empty name selects the root logger.
        "": {
            "handlers": ["default", "file"],
            "level": "INFO",
            "propagate": True,
        },
    },
}

# Crawler configuration: retry count, timeout, rate-limit pause, page cap
# and the User-Agent string.
CRAWLER_CONFIG = dict(
    max_retries=3,
    timeout=30,
    rate_limit_pause=1.0,
    max_pages=100,
    user_agent="AI Case Study Analyzer Bot 1.0",
)
34 changes: 34 additions & 0 deletions src/firecrawl_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""Firecrawl API configuration settings"""

# Settings for the Firecrawl client, grouped by concern. Keys and values are
# plain literals so the mapping can be imported without side effects.
FIRECRAWL_CONFIG = {
    # API Settings
    # The hosted Firecrawl API is served from api.firecrawl.dev.
    "base_url": "https://api.firecrawl.dev/v1",

    # Cache Settings
    "cache_ttl": 3600,  # Cache lifetime in seconds
    "cache_maxsize": 1000,  # Maximum cache entries

    # Rate Limiting
    "rate_limit_calls": 60,  # Calls allowed per period
    "rate_limit_period": 60,  # Period in seconds

    # Concurrent Requests
    "max_concurrent": 5,  # Maximum concurrent requests

    # Timeouts (note the different units: both values equal 30 seconds)
    "request_timeout": 30,  # Request timeout in seconds
    "scrape_timeout": 30000,  # Scrape timeout in milliseconds

    # Retry Settings
    "max_retries": 3,
    "retry_delay": 1,  # Initial retry delay in seconds

    # Output Formats
    "default_formats": ["markdown"],

    # Content Settings
    "only_main_content": True,
    "include_subdomains": True,
    "ignore_sitemap": False,
    "url_limit": 5000,
}
Loading