UNCLASSIFIED - NO CUI

Skip to content
Snippets Groups Projects
Commit 5320c17f authored by Garrett Heaton's avatar Garrett Heaton
Browse files

Merge branch '375-check-for-bad-training-eval-covered-links-in-pipeline' into 'master'

Resolve "Check for bad training/eval covered links in pipeline"

Closes #46 and #375

See merge request 90cos/mttl!300
parents f485adc9 7cbb0c8c
No related branches found
No related tags found
1 merge request!300Resolve "Check for bad training/eval covered links in pipeline"
Pipeline #90327 passed with warnings
......@@ -85,7 +85,22 @@ validate_tasks:
- ValidateTasksLog.json
when: always
expire_in: 1 days
validate_rel_links_urls:
  stage: validate
  services:
    - mongo:latest
  <<: *mongodb_import
  script:
    # validate training rel-link url mappings
    - python3 ./scripts/validate/validate_rel_link_urls.py -t 25
  allow_failure: true
  artifacts:
    paths:
      - validate_rel_link_urls.json
    when: always
    # FIX: GitLab's duration syntax is "3 days", not "3 day"
    expire_in: 3 days
validate_trn_rel_links:
stage: validate
script:
......
#!/usr/bin/python3
import os
import sys
import json
import threading
import queue
import argparse
import requests
import pymongo
from bson.objectid import ObjectId
# Mongo connection settings come from the environment so the same script
# works both locally and inside the CI pipeline's mongo service container.
HOST = os.getenv('MONGO_HOST', 'localhost')
PORT = int(os.getenv('MONGO_PORT', '27017'))
client = pymongo.MongoClient(HOST, PORT)
db = client.mttl
reqs = db.requirements  # NOTE(review): not referenced in this view — confirm still needed
rls = db.rel_links
# Work queue of URLs shared between the main thread and the worker threads.
q = queue.Queue()
num_threads = 10  # default worker-pool size (overridable via -t/--num-threads)
mutex = threading.Lock()  # guards concurrent writes to error_log
error_log = {}  # rel-link _id -> {url: status_code, ..., 'bad_urls': [...]}
def log_error(url: str, msg, bad_url=False):
    """Record a validation failure against every rel-link referencing *url*.

    Args:
        url: the URL that failed validation.
        msg: detail to store (e.g. an HTTP status code) when bad_url is False.
        bad_url: when True, the URL is malformed and is appended to the
            rel-link's 'bad_urls' list instead of being mapped to *msg*.
    """
    for rel_link in rls.find({'KSATs.url': url}):
        _id = str(rel_link['_id'])
        # FIX: use the lock as a context manager so it is released even if
        # an exception is raised mid-update; the bare acquire()/release()
        # pair could leave the lock held forever and deadlock the pool.
        with mutex:
            entry = error_log.setdefault(_id, {})
            if bad_url:
                entry.setdefault('bad_urls', []).append(url)
            else:
                entry[url] = msg
def validate_url_data(url: str):
    """HEAD-request *url* (following redirects) and log any failure.

    A non-200 final response is logged with its status code; a URL that is
    not even well-formed is logged as a bad URL; any other request error
    is printed and otherwise ignored (best effort).
    """
    try:
        # timeout keeps one hung server from stalling a worker thread forever
        res = requests.head(url, allow_redirects=True, timeout=30)
        if res.status_code != 200:
            log_error(url, res.status_code)
    except requests.exceptions.MissingSchema:
        log_error(url, 'Bad URL!', bad_url=True)
    # BUG FIX: the original 'except requests.exceptions as err' named the
    # *module*, not an exception class — evaluating it raised TypeError and
    # killed the worker thread whenever any other request error occurred.
    except requests.exceptions.RequestException as err:
        print('Request Error:', err)
def worker():
    """Daemon-thread loop: pull URLs off the shared queue and validate them.

    task_done() is called in a finally block so that q.join() in main()
    cannot hang forever if validate_url_data() raises unexpectedly.
    """
    while True:
        url = q.get()
        try:
            validate_url_data(url)
        finally:
            q.task_done()
def main():
    """Validate every distinct rel-link URL in the database concurrently.

    Spawns a pool of daemon worker threads, enqueues every distinct
    'KSATs.url' value, and waits for the queue to drain.  If any URL failed
    validation the accumulated error log is written to
    ./validate_rel_link_urls.json and the process exits non-zero so the CI
    pipeline step is flagged.
    """
    parser = argparse.ArgumentParser(description='Validate rel-link URLs')
    parser.add_argument('-t', '--num-threads', type=int, default=num_threads,
                        help='specify number of threads to execute')
    parsed_args = parser.parse_args()
    # BUG FIX: the parsed -t/--num-threads value was previously ignored —
    # the loop used the module-level default, so the pool was always 10.
    for _ in range(parsed_args.num_threads):
        threading.Thread(target=worker, daemon=True).start()
    # enqueue every distinct URL referenced by any rel-link
    for url in db.rel_links.distinct('KSATs.url'):
        q.put(url)
    # block until all queued URLs have been processed
    q.join()
    if error_log:
        with open('./validate_rel_link_urls.json', 'w') as logfile:
            json.dump(error_log, logfile, indent=4)
        sys.exit(1)  # prefer sys.exit over the interactive builtin in scripts


if __name__ == "__main__":
    main()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment