Collect information of NAS members

3 min read

When submitting to PNAS, you’re required to suggest NAS members. Since there are so many of them, manually browsing their profiles on the website is incredibly tedious. To solve this, I had AI write a Python script to collect information for members in relevant fields. The results were great — check out the code below. Beyond just writing code, AI is becoming an autonomous agent through tools like OpenClaw, bridging the gap between instructions and results.

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from concurrent.futures import ThreadPoolExecutor

# Root of the NAS member-search site; member profile links are resolved against it.
BASE = "https://nrc88.nas.edu"
# ASP.NET search page that lists members for a given disciplineID.
SEARCH_URL = "https://nrc88.nas.edu/pnas_search/default.aspx"
# Browser-like User-Agent sent with every request; presumably the site
# rejects the default python-requests UA — TODO confirm.
HEADERS = {
    "User-Agent": "Mozilla/5.0"
}

def safe_request(url, retries=3):
    """GET *url*, retrying on network-level failures.

    Args:
        url: Absolute URL to fetch.
        retries: Maximum number of attempts.

    Returns:
        The ``requests.Response`` from the first successful attempt
        (any HTTP status code), or ``None`` if every attempt raised.
    """
    for attempt in range(retries):
        try:
            return requests.get(url, headers=HEADERS, timeout=20)
        except Exception as exc:
            # Original dropped the exception; include it so retry
            # failures are diagnosable from the log output.
            print("Retry", attempt + 1, url, exc)
            time.sleep(2)
    return None

def get_hidden_fields(soup):
    """Collect all hidden ``<input>`` name/value pairs from *soup*.

    These carry the ASP.NET state fields (``__VIEWSTATE`` etc.) that
    must be echoed back in every postback POST.
    """
    return {
        field.get("name"): field.get("value", "")
        for field in soup.select("input[type=hidden]")
    }

def get_page(session, discipline_id, page_index, soup):
    """POST back to the search page to fetch results page *page_index*.

    Paging is driven by ASP.NET postbacks: the hidden state fields from
    the current page's *soup* are echoed back with __EVENTTARGET set to
    the paging link button for the requested page.

    Returns:
        BeautifulSoup of the requested results page.
    """
    data = get_hidden_fields(soup)
    # ASP.NET auto-generated control ids are zero-padded to two digits
    # (ctl00..ctl09, then ctl10, ...).  The original hard-coded "ctl0"
    # prefix produced "ctl010" for page 10 and broke paging past page 9;
    # :02d is byte-identical for pages 0-9 and correct beyond.
    data["__EVENTTARGET"] = (
        f"ucSearch$ucSearchResults$datalistPaging$ctl{page_index:02d}$linkButtonPage"
    )
    data["__EVENTARGUMENT"] = ""

    r = session.post(
        SEARCH_URL + f"?disciplineID={discipline_id}",
        data=data,
        headers=HEADERS,
        timeout=20
    )

    return BeautifulSoup(r.text, "html.parser")

def collect_editor_links(discipline_id):
    """Collect member profile links for one discipline.

    Walks every page of search results for *discipline_id*.

    Returns:
        Dict mapping absolute profile URL -> member name.
    """
    session = requests.Session()
    r = session.get(
        SEARCH_URL,
        params={"disciplineID": discipline_id},
        headers=HEADERS,
        timeout=20
    )
    soup = BeautifulSoup(r.text, "html.parser")
    editors = {}
    page_links = soup.select("#ucSearch_ucSearchResults_datalistPaging a")
    # When the results fit on a single page there is no paging control,
    # so len(page_links) == 0 and the original `range(0)` loop scraped
    # nothing at all.  Treat that case as exactly one page.
    total_pages = max(len(page_links), 1)
    print("Total pages:", total_pages)

    for page in range(total_pages):
        if page > 0:
            soup = get_page(session, discipline_id, page, soup)
            time.sleep(1)  # be polite to the server between postbacks
        for a in soup.select("a[href*='memberDetails.aspx']"):
            name = a.text.strip()
            link = BASE + "/pnas_search/" + a["href"]
            editors[link] = name

    return editors

def parse_editor(link):
    """Scrape one member-details page into a flat record dict.

    Returns None when the page cannot be fetched or parsing raises;
    returns a record with empty string fields when the details table
    is absent.
    """
    try:
        response = safe_request(link)
        if not response:
            return None
        soup = BeautifulSoup(response.text, "html.parser")
        record = {
            "name": "",
            "location": "",
            "primary_field": "",
            "secondary_field": "",
            "election_citation": "",
            "research_interests": "",
            "profile_link": link
        }

        table = soup.find("table", class_="detailsTable")
        if not table:
            return record

        # Short name/value rows map directly onto record fields.
        simple_fields = {
            "Name": "name",
            "Location": "location",
            "Primary Field": "primary_field",
            "Secondary Field": "secondary_field",
        }
        # Which long-text section the next detailsLongValue cell belongs to;
        # set by the most recent <th> heading seen.
        section = None

        for row in table.find_all("tr"):
            name_cell = row.find("td", class_="detailsName")
            value_cell = row.find("td", class_="detailsValue")
            if name_cell and value_cell:
                key = name_cell.get_text(strip=True)
                if key in simple_fields:
                    record[simple_fields[key]] = value_cell.get_text(" ", strip=True)

            header = row.find("th")
            if header:
                heading = header.get_text(strip=True)
                if "Election Citation" in heading:
                    section = "election_citation"
                elif "Research Interests" in heading:
                    section = "research_interests"

            long_cell = row.find("td", class_="detailsLongValue")
            if long_cell and section:
                record[section] = long_cell.get_text(" ", strip=True)

        return record

    except Exception as e:
        print("Error:", link, e)
        return None

def main(discipline_ids):
    """Scrape every discipline in *discipline_ids* and write one CSV.

    Collects profile links across all disciplines, fetches each profile
    concurrently, and saves the records to pnas_member_editors_full.csv.
    """
    all_links = {}
    for d in discipline_ids:
        print("Scraping discipline:", d)
        all_links.update(collect_editor_links(d))
    # Dict keys are already unique across disciplines; the original
    # `list(set(all_links))` only destroyed insertion order for no benefit.
    links = list(all_links)
    print("Total editors:", len(links))

    results = []
    # Modest parallelism (3 workers) keeps the load on the site polite.
    with ThreadPoolExecutor(max_workers=3) as executor:
        for record in executor.map(parse_editor, links):
            if record:
                results.append(record)

    df = pd.DataFrame(results)
    df.to_csv("./pnas_member_editors_full.csv", index=False)
    print("Saved:", len(df))
    
if __name__ == "__main__":
    # Discipline IDs to scrape; presumably these are the site's
    # disciplineID query-parameter codes for the fields of interest —
    # verify against the search page's discipline list.
    discipline_ids = [24, 28, 52]
    main(discipline_ids)