solorpower_crawler/crawlers/cmsolar.py

513 lines
18 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# ==========================================
# crawlers/cmsolar.py - CMSolar crawler (unit 10)
# Live data: JSON endpoint; history: HTML table parsing
# ==========================================
import requests
import re
from .base import create_session, safe_float
def fetch_data(plant_info):
    """
    Fetch the current output of a CMSolar plant.

    Logs in, selects the site via ``change.php`` and then reads the JSON
    endpoint ``/plant/sub/idx_ok.php?mode=getPlant``.

    Args:
        plant_info: Plant configuration dict. Keys read here: ``id``,
            ``name``, ``company_name``, ``auth`` (``login_id``, ``login_pw``,
            ``site_no``) and ``system`` (``login_url``, ``base_url``).

    Returns:
        A one-element list holding a dict with the keys ``id``, ``name``,
        ``kw``, ``today`` and ``status`` on success; an empty list on any
        non-200 response, network error or unexpected payload.
    """
    plant_id = plant_info.get('id', 'cmsolar-10')
    auth = plant_info.get('auth', {})
    system = plant_info.get('system', {})
    company_name = plant_info.get('company_name', '함안햇빛발전소')
    plant_name = plant_info.get('name', '10호기')
    login_url = system.get('login_url', '')
    site_no = auth.get('site_no', '')
    # NOTE(review): the history functions in this file read 'api_url' for the
    # same default host, while this one reads 'base_url' — confirm which key
    # the plant configuration actually provides.
    base_url = system.get('base_url', 'http://www.cmsolar2.kr')

    # Without a timeout a stalled server would block the crawler forever.
    timeout = 10

    session = create_session()
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Content-Type': 'application/x-www-form-urlencoded'
    }
    login_data = {
        'login_id': auth.get('login_id', ''),
        'login_pw': auth.get('login_pw', ''),
        'site_no': site_no
    }

    # Log in, then select the site (required before calling idx_ok.php).
    try:
        res = session.post(login_url, data=login_data, headers=headers,
                           timeout=timeout)
        if res.status_code != 200:
            return []
        change_url = f"{base_url}/change.php?site={site_no}"
        session.get(change_url, headers=headers, timeout=timeout)
    except Exception as e:
        print(f"{plant_name} 접속 에러: {e}")
        return []

    # Data request (JSON endpoint).
    target_url = f"{base_url}/plant/sub/idx_ok.php?mode=getPlant"
    try:
        res = session.get(target_url, headers=headers, timeout=timeout)
        if res.status_code != 200:
            return []

        # Fall back to UTF-8 when the response declares no encoding.
        if res.encoding is None:
            res.encoding = 'utf-8'
        data = res.json()

        # Expected payload shape: [{"plant": {...}}]
        if not (isinstance(data, list) and data):
            print(f"{plant_name} 데이터 형식 오류: {data}")
            return []
        plant_data = data[0].get('plant', {})

        # Both readings are divided by 1000 (the original comment describes
        # this as a W -> kW conversion).
        curr_kw = safe_float(plant_data.get('now', 0)) / 1000.0
        today_kwh = safe_float(plant_data.get('today', 0)) / 1000.0

        # Status is driven only by the inverter error flag: 0 kW is normal at
        # night, so a zero reading alone is not treated as a fault.
        is_error = int(plant_data.get('inv_error', 0))
        status = "🟢 정상" if is_error == 0 else "🔴 점검/고장"

        return [{
            'id': plant_id,
            'name': f'{company_name} {plant_name}',
            'kw': curr_kw,
            'today': today_kwh,
            'status': status
        }]
    except Exception as e:
        # Crawler boundary: report the failure and return no data.
        print(f"{plant_name} 에러: {e}")
        return []
def fetch_history_hourly(plant_info, start_date, end_date):
    """
    Collect past hourly generation data for a CMSolar plant.

    Endpoint: /plant/sub/report_ok.php (responds with an HTML table)
    Parameters: mode=getPowers&type=daily&device=total&start=YYYY-MM-DD&money=

    One request is made per calendar day from ``start_date`` to ``end_date``
    inclusive; a failure on one day is printed and the loop moves on.

    Args:
        plant_info: Plant configuration dict. Keys read here: ``id``,
            ``name``, ``auth`` (``login_id``, ``login_pw``, ``site_no``) and
            ``system`` (``login_url``, ``api_url``).
        start_date: First day to collect, as ``'YYYY-MM-DD'``.
        end_date: Last day to collect, as ``'YYYY-MM-DD'``.

    Returns:
        A list of dicts with the keys ``plant_id``, ``timestamp``
        (``'YYYY-MM-DD HH:00:00'``), ``generation_kwh`` and ``current_kw``
        (always 0). Empty when login or site selection fails.

    Raises:
        ValueError: If ``start_date`` or ``end_date`` does not match
            ``%Y-%m-%d`` (raised after the login step).
    """
    from datetime import datetime, timedelta

    results = []
    plant_id = plant_info.get('id', 'cmsolar-10')
    auth = plant_info.get('auth', {})
    system = plant_info.get('system', {})
    plant_name = plant_info.get('name', '10호기')
    site_no = auth.get('site_no', '')
    login_url = system.get('login_url', '')

    # Data endpoint
    base_url = system.get('api_url', 'http://www.cmsolar2.kr')
    data_url = f"{base_url}/plant/sub/report_ok.php"

    # Same limit for every request so a stalled login cannot hang the run.
    timeout = 10

    # Compiled once instead of on every loop iteration. The value column
    # accepts thousands separators (e.g. "1,234.5"), matching the daily and
    # monthly parsers; without the comma such rows were silently dropped.
    tbody_re = re.compile(r'<tbody>(.*?)</tbody>', re.DOTALL)
    row_re = re.compile(r'<tr[^>]*>\s*<td>(\d+)</td>\s*<td>([\d.,]+)</td>')

    session = create_session()

    print(f"\n{'='*60}")
    print(f"[CMSolar History] {plant_name} ({start_date} ~ {end_date})")
    print(f"{'='*60}")

    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
    }
    login_data = {
        'login_id': auth.get('login_id', ''),
        'login_pw': auth.get('login_pw', ''),
        'site_no': site_no
    }

    try:
        res = session.post(login_url, data=login_data, headers=headers,
                           timeout=timeout)
        if res.status_code != 200:
            print(" ✗ Login failed")
            return results
        print(" ✓ Login successful")
    except Exception as e:
        print(f" ✗ Login error: {e}")
        return results

    # Site selection (required)
    try:
        change_url = f"{base_url}/change.php?site={site_no}"
        session.get(change_url, headers=headers, timeout=timeout)
        print(" ✓ Site selected")
    except Exception as e:
        print(f" ✗ Site selection error: {e}")
        return results

    # Iterate day by day; an inverted range yields no iterations.
    first_day = datetime.strptime(start_date, '%Y-%m-%d')
    last_day = datetime.strptime(end_date, '%Y-%m-%d')
    for offset in range((last_day - first_day).days + 1):
        date_str = (first_day + timedelta(days=offset)).strftime('%Y-%m-%d')

        # type=daily returns one day's worth of hourly rows.
        params = {
            'mode': 'getPowers',
            'type': 'daily',
            'device': 'total',
            'start': date_str,
            'money': ''
        }
        try:
            res = session.get(data_url, params=params, headers=headers,
                              timeout=timeout)
            res.encoding = 'utf-8'
            if res.status_code != 200:
                print(f" ✗ HTTP {res.status_code}")
                continue

            # Rows live inside <tbody>, e.g.
            # <tr class="odd"><td>9</td><td>3.0</td>...
            tbody_match = tbody_re.search(res.text)
            if not tbody_match:
                print(f" ⚠ No tbody found for {date_str}")
                continue

            matches = row_re.findall(tbody_match.group(1))
            if not matches:
                print(f" ⚠ No data for {date_str}")
                continue

            print(f" ✓ Found {len(matches)} hourly records for {date_str}")
            for hour, kwh in matches:
                results.append({
                    'plant_id': plant_id,
                    'timestamp': f"{date_str} {hour.zfill(2)}:00:00",
                    # Strip thousands separators before converting.
                    'generation_kwh': safe_float(kwh.replace(',', '')),
                    'current_kw': 0
                })
        except Exception as e:
            # Best effort: report this day's failure and keep going.
            print(f" ✗ Error for {date_str}: {e}")

    print(f"\n{'='*60}")
    print(f"[Total] Collected {len(results)} hourly records")
    print(f"{'='*60}\n")
    return results
def fetch_history_daily(plant_info, start_date, end_date):
    """
    Collect past daily generation data for a CMSolar plant.

    Endpoint: /plant/sub/report_ok.php (responds with an HTML table)
    Parameters: mode=getPowers&type=month&device=total&start=YYYY-MM-01&money=

    One request is made per calendar month touched by the range; rows outside
    ``start_date``..``end_date`` are discarded. A failure on one month is
    printed and the loop moves on.

    Args:
        plant_info: Plant configuration dict. Keys read here: ``id``,
            ``name``, ``auth`` (``login_id``, ``login_pw``, ``site_no``) and
            ``system`` (``login_url``, ``api_url``).
        start_date: First day to collect, as ``'YYYY-MM-DD'``.
        end_date: Last day to collect, as ``'YYYY-MM-DD'``.

    Returns:
        A list of dicts with the keys ``plant_id``, ``date``
        (``'YYYY-MM-DD'``), ``generation_kwh`` and ``current_kw`` (always 0).
        Empty when login or site selection fails.

    Raises:
        ValueError: If ``start_date`` or ``end_date`` does not match
            ``%Y-%m-%d`` (raised after the login step).
    """
    from datetime import datetime
    from dateutil.relativedelta import relativedelta

    results = []
    plant_id = plant_info.get('id', 'cmsolar-10')
    auth = plant_info.get('auth', {})
    system = plant_info.get('system', {})
    plant_name = plant_info.get('name', '10호기')
    site_no = auth.get('site_no', '')
    login_url = system.get('login_url', '')

    # Data endpoint
    base_url = system.get('api_url', 'http://www.cmsolar2.kr')
    data_url = f"{base_url}/plant/sub/report_ok.php"

    # Same limit for every request so a stalled login cannot hang the run.
    timeout = 10

    # Compiled once instead of on every loop iteration.
    tbody_re = re.compile(r'<tbody>(.*?)</tbody>', re.DOTALL)
    row_re = re.compile(r'<tr[^>]*>\s*<td>(\d+)</td>\s*<td>([\d.,]+)</td>')

    session = create_session()

    print(f"\n{'='*60}")
    print(f"[CMSolar Daily] {plant_name} ({start_date} ~ {end_date})")
    print(f"{'='*60}")

    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
    }
    login_data = {
        'login_id': auth.get('login_id', ''),
        'login_pw': auth.get('login_pw', ''),
        'site_no': site_no
    }

    try:
        res = session.post(login_url, data=login_data, headers=headers,
                           timeout=timeout)
        if res.status_code != 200:
            print(" ✗ Login failed")
            return results
        print(" ✓ Login successful")
    except Exception as e:
        print(f" ✗ Login error: {e}")
        return results

    # Site selection (required)
    try:
        change_url = f"{base_url}/change.php?site={site_no}"
        session.get(change_url, headers=headers, timeout=timeout)
        print(" ✓ Site selected")
    except Exception as e:
        print(f" ✗ Site selection error: {e}")
        return results

    # Iterate month by month (type=month returns one month of daily rows).
    start_dt = datetime.strptime(start_date, '%Y-%m-%d')
    end_dt = datetime.strptime(end_date, '%Y-%m-%d')
    current_date = start_dt
    while current_date <= end_dt:
        month_start = current_date.strftime('%Y-%m-01')
        month_label = month_start[:7]
        year = current_date.year
        month = current_date.month
        # Advance to the next month up front so the `continue` statements
        # below cannot skip the step and loop forever.
        current_date = current_date.replace(day=1) + relativedelta(months=1)

        params = {
            'mode': 'getPowers',
            'type': 'month',
            'device': 'total',
            'start': month_start,
            'money': ''
        }
        try:
            res = session.get(data_url, params=params, headers=headers,
                              timeout=timeout)
            res.encoding = 'utf-8'
            if res.status_code != 200:
                print(f" ✗ HTTP {res.status_code} for {month_label}")
                continue

            # Rows live inside <tbody>, e.g.
            # <tr class="odd"><td>1</td><td>136.00</td>...
            tbody_match = tbody_re.search(res.text)
            if not tbody_match:
                print(f" ⚠ No tbody found for {month_label}")
                continue

            matches = row_re.findall(tbody_match.group(1))
            if not matches:
                print(f" ⚠ No data for {month_label}")
                continue

            print(f" ✓ Found {len(matches)} daily records for {month_label}")
            for day, kwh in matches:
                # Build a real date: this rejects impossible day numbers and
                # lets the range filter compare dates rather than strings,
                # which breaks for non-zero-padded input such as '2024-1-5'.
                try:
                    row_dt = datetime(year, month, int(day))
                except (ValueError, OverflowError):
                    continue
                if not (start_dt <= row_dt <= end_dt):
                    continue

                # Strip thousands separators before converting.
                generation_kwh = safe_float(kwh.replace(',', ''))
                date_str = f"{year:04d}-{month:02d}-{int(day):02d}"
                results.append({
                    'plant_id': plant_id,
                    'date': date_str,
                    'generation_kwh': generation_kwh,
                    'current_kw': 0
                })
                print(f"{date_str}: {generation_kwh:.2f}kWh")
        except Exception as e:
            # Best effort: report this month's failure and keep going.
            print(f" ✗ Error for {month_label}: {e}")

    print(f"[Total] Collected {len(results)} daily records\n")
    return results
def fetch_history_monthly(plant_info, start_month, end_month):
    """
    Collect past monthly generation data for a CMSolar plant.

    Endpoint: /plant/sub/report_ok.php (responds with an HTML table)
    Parameters: mode=getPowers&type=year&device=total&start=YYYY-01-01&money=

    One request is made per calendar year touched by the range; rows outside
    the effective range are discarded. The effective start is the later of
    ``start_month`` and the month of ``plant_info['start_date']``. A failure
    on one year is printed and the loop moves on.

    Args:
        plant_info: Plant configuration dict. Keys read here: ``id``,
            ``name``, ``start_date`` (default ``'2020-08-31'``), ``auth``
            (``login_id``, ``login_pw``, ``site_no``) and ``system``
            (``login_url``, ``api_url``).
        start_month: First month to collect, as ``'YYYY-MM'``.
        end_month: Last month to collect, as ``'YYYY-MM'``.

    Returns:
        A list of dicts with the keys ``plant_id``, ``month`` (``'YYYY-MM'``)
        and ``generation_kwh``. Empty when login or site selection fails.

    Raises:
        ValueError: If the effective start month or ``end_month`` does not
            match ``%Y-%m`` (raised after the login step).
    """
    from datetime import datetime

    results = []
    plant_id = plant_info.get('id', 'cmsolar-10')
    auth = plant_info.get('auth', {})
    system = plant_info.get('system', {})
    plant_name = plant_info.get('name', '10호기')

    # Never collect from before the plant's commissioning month.
    # NOTE(review): this is a lexicographic comparison, so it assumes both
    # values are zero-padded 'YYYY-MM' — confirm against callers.
    plant_start_date = plant_info.get('start_date', '2020-08-31')
    plant_start_month = plant_start_date[:7]  # YYYY-MM
    if start_month < plant_start_month:
        actual_start = plant_start_month
        print(f" 발전소 가동일({plant_start_date}) 이후부터 수집: {actual_start}")
    else:
        actual_start = start_month

    site_no = auth.get('site_no', '')
    login_url = system.get('login_url', '')

    # Data endpoint
    base_url = system.get('api_url', 'http://www.cmsolar2.kr')
    data_url = f"{base_url}/plant/sub/report_ok.php"

    # Same limit for every request so a stalled login cannot hang the run.
    timeout = 10

    # Compiled once instead of on every loop iteration.
    tbody_re = re.compile(r'<tbody>(.*?)</tbody>', re.DOTALL)
    row_re = re.compile(r'<tr[^>]*>\s*<td>(\d+)</td>\s*<td>([\d.,]+)</td>')

    session = create_session()

    print(f"\n{'='*60}")
    print(f"[CMSolar Monthly] {plant_name} ({actual_start} ~ {end_month})")
    print(f"{'='*60}")

    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
    }
    login_data = {
        'login_id': auth.get('login_id', ''),
        'login_pw': auth.get('login_pw', ''),
        'site_no': site_no
    }

    try:
        res = session.post(login_url, data=login_data, headers=headers,
                           timeout=timeout)
        if res.status_code != 200:
            print(" ✗ Login failed")
            return results
        print(" ✓ Login successful")
    except Exception as e:
        print(f" ✗ Login error: {e}")
        return results

    # Site selection (required)
    try:
        change_url = f"{base_url}/change.php?site={site_no}"
        session.get(change_url, headers=headers, timeout=timeout)
        print(" ✓ Site selected")
    except Exception as e:
        print(f" ✗ Site selection error: {e}")
        return results

    # Iterate year by year (type=year returns one year of monthly rows).
    start_dt = datetime.strptime(actual_start, '%Y-%m')
    end_dt = datetime.strptime(end_month, '%Y-%m')
    start_key = (start_dt.year, start_dt.month)
    end_key = (end_dt.year, end_dt.month)

    # Each year in the range is requested exactly once, which makes the
    # former "already processed year" bookkeeping unnecessary. An inverted
    # range yields no requests at all.
    if start_dt <= end_dt:
        years = range(start_dt.year, end_dt.year + 1)
    else:
        years = range(0)

    for year in years:
        params = {
            'mode': 'getPowers',
            'type': 'year',
            'device': 'total',
            'start': f"{year}-01-01",
            'money': ''
        }
        try:
            res = session.get(data_url, params=params, headers=headers,
                              timeout=timeout)
            res.encoding = 'utf-8'
            if res.status_code != 200:
                print(f" ✗ HTTP {res.status_code} for year {year}")
                continue

            # Rows live inside <tbody>, e.g.
            # <tr class="even"><td>1</td><td>2,836.00</td>...
            tbody_match = tbody_re.search(res.text)
            if not tbody_match:
                print(f" ⚠ No tbody found for year {year}")
                continue

            year_count = 0
            for month, kwh in row_re.findall(tbody_match.group(1)):
                month_no = int(month)
                # Skip rows whose first cell is not a real month number.
                if not 1 <= month_no <= 12:
                    continue
                # Compare (year, month) pairs instead of strings so that
                # non-zero-padded input such as '2024-3' filters correctly.
                if not (start_key <= (year, month_no) <= end_key):
                    continue

                # Strip thousands separators before converting.
                generation_kwh = safe_float(kwh.replace(',', ''))
                month_str = f"{year:04d}-{month_no:02d}"
                results.append({
                    'plant_id': plant_id,
                    'month': month_str,
                    'generation_kwh': generation_kwh
                })
                print(f"{month_str}: {generation_kwh:.1f}kWh")
                year_count += 1

            if year_count > 0:
                print(f" → Collected {year_count} months from {year}")
        except Exception as e:
            # Best effort: report this year's failure and keep going.
            print(f" ✗ Error for year {year}: {e}")

    print(f"[Total] Collected {len(results)} monthly records\n")
    return results